6065 page hash: use a static inline instead of a macro
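The change below converts the PAGE_HASH_SEARCH statement macro into a static inline function, page_hash_search(). As a hedged sketch of the general pattern (hypothetical names, not the kernel code): a macro that "returns" through an output parameter becomes an inline function with typed parameters, single evaluation of its arguments, and a real return value that a debugger can see.

    #include <stddef.h>

    typedef struct node {
            struct node *n_next;
            int n_key;
    } node_t;

    /* Before: a statement macro that returns through an output parameter. */
    #define LIST_FIND(head, out, k)                                 \
    {                                                               \
            for ((out) = (head); (out); (out) = (out)->n_next)      \
                    if ((out)->n_key == (k))                        \
                            break;                                  \
    }

    /* After: a static inline with type checking and a debuggable symbol. */
    static inline node_t *
    list_find(node_t *head, int k)
    {
            node_t *n;

            for (n = head; n != NULL; n = n->n_next)
                    if (n->n_key == k)
                            break;

            return (n);
    }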
--- old/usr/src/uts/common/vm/vm_page.c
+++ new/usr/src/uts/common/vm/vm_page.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
23 24 */
24 25
25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 27 /* All Rights Reserved */
27 28
28 29 /*
29 30 * University Copyright- Copyright (c) 1982, 1986, 1988
30 31 * The Regents of the University of California
31 32 * All Rights Reserved
32 33 *
33 34 * University Acknowledgment- Portions of this document are derived from
34 35 * software developed by the University of California, Berkeley, and its
35 36 * contributors.
36 37 */
37 38
38 39 /*
39 40 * VM - physical page management.
40 41 */
41 42
42 43 #include <sys/types.h>
43 44 #include <sys/t_lock.h>
44 45 #include <sys/param.h>
45 46 #include <sys/systm.h>
46 47 #include <sys/errno.h>
47 48 #include <sys/time.h>
48 49 #include <sys/vnode.h>
49 50 #include <sys/vm.h>
50 51 #include <sys/vtrace.h>
51 52 #include <sys/swap.h>
52 53 #include <sys/cmn_err.h>
53 54 #include <sys/tuneable.h>
54 55 #include <sys/sysmacros.h>
55 56 #include <sys/cpuvar.h>
56 57 #include <sys/callb.h>
57 58 #include <sys/debug.h>
58 59 #include <sys/tnf_probe.h>
59 60 #include <sys/condvar_impl.h>
60 61 #include <sys/mem_config.h>
61 62 #include <sys/mem_cage.h>
62 63 #include <sys/kmem.h>
63 64 #include <sys/atomic.h>
64 65 #include <sys/strlog.h>
65 66 #include <sys/mman.h>
66 67 #include <sys/ontrap.h>
67 68 #include <sys/lgrp.h>
68 69 #include <sys/vfs.h>
69 70
70 71 #include <vm/hat.h>
71 72 #include <vm/anon.h>
72 73 #include <vm/page.h>
73 74 #include <vm/seg.h>
74 75 #include <vm/pvn.h>
75 76 #include <vm/seg_kmem.h>
76 77 #include <vm/vm_dep.h>
77 78 #include <sys/vm_usage.h>
78 79 #include <fs/fs_subr.h>
79 80 #include <sys/ddi.h>
80 81 #include <sys/modctl.h>
81 82
82 83 static pgcnt_t max_page_get; /* max page_get request size in pages */
83 84 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
84 85
85 86 /*
86 87 * freemem_lock protects all freemem variables:
87 88 * availrmem. Also this lock protects the globals which track the
88 89 * availrmem changes for accurate kernel footprint calculation.
89 90 * See below for an explanation of these
90 91 * globals.
91 92 */
92 93 kmutex_t freemem_lock;
93 94 pgcnt_t availrmem;
94 95 pgcnt_t availrmem_initial;
95 96
96 97 /*
97 98 * These globals track availrmem changes to get a more accurate
98 99 * estimate of the kernel size. Historically pp_kernel is used for
99 100 * kernel size and is based on availrmem. But availrmem is adjusted for
100 101 * locked pages in the system not just for kernel locked pages.
101 102 * These new counters will track the pages locked through segvn and
102 103 * by explicit user locking.
103 104 *
104 105 * pages_locked : How many pages are locked because of user specified
105 106 * locking through mlock or plock.
106 107 *
107 108 * pages_useclaim, pages_claimed : These two variables track the
108 109 * claim adjustments because of the protection changes on a segvn segment.
109 110 *
110 111 * All these globals are protected by the same lock which protects availrmem.
111 112 */
112 113 pgcnt_t pages_locked = 0;
113 114 pgcnt_t pages_useclaim = 0;
114 115 pgcnt_t pages_claimed = 0;
115 116
116 117
117 118 /*
118 119 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
119 120 */
120 121 static kmutex_t new_freemem_lock;
121 122 static uint_t freemem_wait; /* someone waiting for freemem */
122 123 static kcondvar_t freemem_cv;
123 124
124 125 /*
125 126 * The logical page free list is maintained as two lists, the 'free'
126 127 * and the 'cache' lists.
127 128 * The free list contains those pages that should be reused first.
128 129 *
129 130 * The implementation of the lists is machine dependent.
130 131 * page_get_freelist(), page_get_cachelist(),
131 132 * page_list_sub(), and page_list_add()
132 133 * form the interface to the machine dependent implementation.
133 134 *
134 135 * Pages with p_free set are on the cache list.
135 136 * Pages with p_free and p_age set are on the free list,
136 137 *
137 138 * A page may be locked while on either list.
138 139 */
139 140
140 141 /*
141 142 * free list accounting stuff.
142 143 *
143 144 *
144 145 * Spread out the value for the number of pages on the
145 146 * page free and page cache lists. If there is just one
146 147 * value, then it must be under just one lock.
147 148 * The lock contention and cache traffic are a real bother.
148 149 *
149 150 * When we acquire and then drop a single pcf lock
150 151 * we can start in the middle of the array of pcf structures.
151 152 * If we acquire more than one pcf lock at a time, we need to
152 153 * start at the front to avoid deadlocking.
153 154 *
154 155 * pcf_count holds the number of pages in each pool.
155 156 *
156 157 * pcf_block is set when page_create_get_something() has asked the
157 158 * PSM page freelist and page cachelist routines without specifying
158 159 * a color and nothing came back. This is used to block anything
159 160 * else from moving pages from one list to the other while the
160 161 * lists are searched again. If a page is freed while pcf_block is
161 162 * set, then pcf_reserve is incremented. pcgs_unblock() takes care
162 163 * of clearing pcf_block, doing the wakeups, etc.
163 164 */
164 165
165 166 #define MAX_PCF_FANOUT NCPU
166 167 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
167 168 static uint_t pcf_fanout_mask = 0;
168 169
169 170 struct pcf {
170 171 kmutex_t pcf_lock; /* protects the structure */
171 172 uint_t pcf_count; /* page count */
172 173 uint_t pcf_wait; /* number of waiters */
173 174 uint_t pcf_block; /* pcgs flag to page_free() */
174 175 uint_t pcf_reserve; /* pages freed after pcf_block set */
175 176 uint_t pcf_fill[10]; /* to line up on the caches */
176 177 };
177 178
178 179 /*
179 180 * PCF_INDEX hash needs to be dynamic (every so often the hash changes where
180 181 * it will hash the cpu to). This is done to prevent a drain condition
181 182 * from happening. This drain condition will occur when pcf_count decrement
182 183 * occurs on cpu A and the increment of pcf_count always occurs on cpu B. An
183 184 * example of this shows up with device interrupts. The dma buffer is allocated
184 185 * by the cpu requesting the IO, thus the pcf_count is decremented based on that.
185 186 * When the memory is returned by the interrupt thread, the pcf_count will be
186 187 * incremented based on the cpu servicing the interrupt.
187 188 */
188 189 static struct pcf pcf[MAX_PCF_FANOUT];
189 190 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
190 191 (randtick() >> 24)) & (pcf_fanout_mask))
191 192
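The drain condition described above can be made concrete with a hedged sketch (FANOUT and the two helpers are illustrative, not kernel code): with a fixed per-CPU bucket index, a driver that allocates on CPU A and frees from interrupt context on CPU B decrements one bucket and increments another forever; folding in the high bits of a tick, as PCF_INDEX() does with randtick() >> 24, slowly rotates both sides around the array.

    #include <stdint.h>

    #define FANOUT      8               /* power of two, illustrative */
    #define FANOUT_MASK (FANOUT - 1)

    /* Drain-prone: each CPU is pinned to one bucket for all time. */
    static unsigned int
    bucket_static(unsigned int cpu_seqid)
    {
            return (cpu_seqid & FANOUT_MASK);
    }

    /*
     * Rotating: the high bits of the tick change every so often, so a
     * decrement/increment pair split across two CPUs still walks over
     * every bucket eventually instead of draining a single one.
     */
    static unsigned int
    bucket_rotating(unsigned int cpu_seqid, uint64_t tick)
    {
            return ((cpu_seqid + (unsigned int)(tick >> 24)) & FANOUT_MASK);
    }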
192 193 static int pcf_decrement_bucket(pgcnt_t);
193 194 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
194 195
195 196 kmutex_t pcgs_lock; /* serializes page_create_get_ */
196 197 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
197 198 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
198 199 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
199 200
200 201 #ifdef VM_STATS
201 202
202 203 /*
203 204 * No locks, but so what, they are only statistics.
204 205 */
205 206
206 207 static struct page_tcnt {
207 208 int pc_free_cache; /* free's into cache list */
208 209 int pc_free_dontneed; /* free's with dontneed */
209 210 int pc_free_pageout; /* free's from pageout */
210 211 int pc_free_free; /* free's into free list */
211 212 int pc_free_pages; /* free's into large page free list */
212 213 int pc_destroy_pages; /* large page destroy's */
213 214 int pc_get_cache; /* get's from cache list */
214 215 int pc_get_free; /* get's from free list */
215 216 int pc_reclaim; /* reclaim's */
216 217 int pc_abortfree; /* abort's of free pages */
217 218 int pc_find_hit; /* find's that find page */
218 219 int pc_find_miss; /* find's that don't find page */
219 220 int pc_destroy_free; /* # of free pages destroyed */
220 221 #define PC_HASH_CNT (4*PAGE_HASHAVELEN)
221 222 int pc_find_hashlen[PC_HASH_CNT+1];
222 223 int pc_addclaim_pages;
223 224 int pc_subclaim_pages;
224 225 int pc_free_replacement_page[2];
225 226 int pc_try_demote_pages[6];
226 227 int pc_demote_pages[2];
227 228 } pagecnt;
228 229
229 230 uint_t hashin_count;
230 231 uint_t hashin_not_held;
231 232 uint_t hashin_already;
232 233
233 234 uint_t hashout_count;
234 235 uint_t hashout_not_held;
235 236
236 237 uint_t page_create_count;
237 238 uint_t page_create_not_enough;
238 239 uint_t page_create_not_enough_again;
239 240 uint_t page_create_zero;
240 241 uint_t page_create_hashout;
241 242 uint_t page_create_page_lock_failed;
242 243 uint_t page_create_trylock_failed;
243 244 uint_t page_create_found_one;
244 245 uint_t page_create_hashin_failed;
245 246 uint_t page_create_dropped_phm;
246 247
247 248 uint_t page_create_new;
248 249 uint_t page_create_exists;
249 250 uint_t page_create_putbacks;
250 251 uint_t page_create_overshoot;
251 252
252 253 uint_t page_reclaim_zero;
253 254 uint_t page_reclaim_zero_locked;
254 255
255 256 uint_t page_rename_exists;
256 257 uint_t page_rename_count;
257 258
258 259 uint_t page_lookup_cnt[20];
259 260 uint_t page_lookup_nowait_cnt[10];
260 261 uint_t page_find_cnt;
261 262 uint_t page_exists_cnt;
262 263 uint_t page_exists_forreal_cnt;
263 264 uint_t page_lookup_dev_cnt;
264 265 uint_t get_cachelist_cnt;
265 266 uint_t page_create_cnt[10];
266 267 uint_t alloc_pages[9];
267 268 uint_t page_exphcontg[19];
268 269 uint_t page_create_large_cnt[10];
269 270
270 -/*
271 - * Collects statistics.
272 - */
273 -#define PAGE_HASH_SEARCH(index, pp, vp, off) { \
274 - uint_t mylen = 0; \
275 - \
276 - for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
277 - if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
278 - break; \
279 - } \
280 - if ((pp) != NULL) \
281 - pagecnt.pc_find_hit++; \
282 - else \
283 - pagecnt.pc_find_miss++; \
284 - if (mylen > PC_HASH_CNT) \
285 - mylen = PC_HASH_CNT; \
286 - pagecnt.pc_find_hashlen[mylen]++; \
287 -}
288 -
289 -#else /* VM_STATS */
290 -
291 -/*
292 - * Don't collect statistics
293 - */
294 -#define PAGE_HASH_SEARCH(index, pp, vp, off) { \
295 - for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
296 - if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
297 - break; \
298 - } \
299 -}
271 +#endif
300 272
301 -#endif /* VM_STATS */
273 +static inline page_t *
274 +page_hash_search(ulong_t index, vnode_t *vnode, u_offset_t off)
275 +{
276 + uint_t mylen = 0;
277 + page_t *page;
302 278
279 + for (page = page_hash[index]; page; page = page->p_hash, mylen++)
280 + if (page->p_vnode == vnode && page->p_offset == off)
281 + break;
282 +
283 +#ifdef VM_STATS
284 + if (page != NULL)
285 + pagecnt.pc_find_hit++;
286 + else
287 + pagecnt.pc_find_miss++;
288 +
289 + pagecnt.pc_find_hashlen[MIN(mylen, PC_HASH_CNT)]++;
290 +#endif
291 +
292 + return (page);
293 +}
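The call sites further down slot the new function in where the macro's output parameter used to be. The locked lookup pattern, mirroring page_find() later in this file, is:

    index = PAGE_HASH_FUNC(vp, off);
    phm = PAGE_HASH_MUTEX(index);

    mutex_enter(phm);
    pp = page_hash_search(index, vp, off);
    mutex_exit(phm);

Unlocked hint-style callers such as page_exists() can now return the function's value directly, which is why the diff drops the local page_t * there.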
303 294
304 295
305 296 #ifdef DEBUG
306 297 #define MEMSEG_SEARCH_STATS
307 298 #endif
308 299
309 300 #ifdef MEMSEG_SEARCH_STATS
310 301 struct memseg_stats {
311 302 uint_t nsearch;
312 303 uint_t nlastwon;
313 304 uint_t nhashwon;
314 305 uint_t nnotfound;
315 306 } memseg_stats;
316 307
317 308 #define MEMSEG_STAT_INCR(v) \
318 309 atomic_inc_32(&memseg_stats.v)
319 310 #else
320 311 #define MEMSEG_STAT_INCR(x)
321 312 #endif
322 313
323 314 struct memseg *memsegs; /* list of memory segments */
324 315
325 316 /*
326 317 * /etc/system tunable to control large page allocation heuristic.
327 318 *
328 319 * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
329 320 * for large page allocation requests. If a large page is not readily
330 321 * available on the local freelists we will go through additional effort
331 322 * to create a large page, potentially moving smaller pages around to coalesce
332 323 * larger pages in the local lgroup.
333 324 * Default value of LPAP_DEFAULT will go to remote freelists if large pages
334 325 * are not readily available in the local lgroup.
335 326 */
336 327 enum lpap {
337 328 LPAP_DEFAULT, /* default large page allocation policy */
338 329 LPAP_LOCAL /* local large page allocation policy */
339 330 };
340 331
341 332 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
342 333
343 334 static void page_init_mem_config(void);
344 335 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
345 336 static void page_do_hashout(page_t *);
346 337 static void page_capture_init();
347 338 int page_capture_take_action(page_t *, uint_t, void *);
348 339
349 340 static void page_demote_vp_pages(page_t *);
350 341
351 342
352 343 void
353 344 pcf_init(void)
354 345
355 346 {
356 347 if (boot_ncpus != -1) {
357 348 pcf_fanout = boot_ncpus;
358 349 } else {
359 350 pcf_fanout = max_ncpus;
360 351 }
361 352 #ifdef sun4v
362 353 /*
363 354 * Force at least 4 buckets if possible for sun4v.
364 355 */
365 356 pcf_fanout = MAX(pcf_fanout, 4);
366 357 #endif /* sun4v */
367 358
368 359 /*
369 360 * Round up to the nearest power of 2.
370 361 */
371 362 pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
372 363 if (!ISP2(pcf_fanout)) {
373 364 pcf_fanout = 1 << highbit(pcf_fanout);
374 365
375 366 if (pcf_fanout > MAX_PCF_FANOUT) {
376 367 pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
377 368 }
378 369 }
379 370 pcf_fanout_mask = pcf_fanout - 1;
380 371 }
381 372
382 373 /*
383 374 * vm subsystem related initialization
384 375 */
385 376 void
386 377 vm_init(void)
387 378 {
388 379 boolean_t callb_vm_cpr(void *, int);
389 380
390 381 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
391 382 page_init_mem_config();
392 383 page_retire_init();
393 384 vm_usage_init();
394 385 page_capture_init();
395 386 }
396 387
397 388 /*
398 389 * This function is called at startup and when memory is added or deleted.
399 390 */
400 391 void
401 392 init_pages_pp_maximum()
402 393 {
403 394 static pgcnt_t p_min;
404 395 static pgcnt_t pages_pp_maximum_startup;
405 396 static pgcnt_t avrmem_delta;
406 397 static int init_done;
407 398 static int user_set; /* true if set in /etc/system */
408 399
409 400 if (init_done == 0) {
410 401
411 402 /* If the user specified a value, save it */
412 403 if (pages_pp_maximum != 0) {
413 404 user_set = 1;
414 405 pages_pp_maximum_startup = pages_pp_maximum;
415 406 }
416 407
417 408 /*
418 409 * The initial setting of pages_pp_maximum is based
419 410 * on the value of availrmem just after the start-up
420 411 * allocations. To preserve this relationship at run
421 412 * time, use a delta from availrmem_initial.
422 413 */
423 414 ASSERT(availrmem_initial >= availrmem);
424 415 avrmem_delta = availrmem_initial - availrmem;
425 416
426 417 /* The allowable floor of pages_pp_maximum */
427 418 p_min = tune.t_minarmem + 100;
428 419
429 420 /* Make sure we don't come through here again. */
430 421 init_done = 1;
431 422 }
432 423 /*
433 424 * Determine pages_pp_maximum, the number of currently available
434 425 * pages (availrmem) that can't be `locked'. If not set by
435 426 * the user, we set it to 4% of the currently available memory
436 427 * plus 4MB.
437 428 * But we also insist that it be greater than tune.t_minarmem;
438 429 * otherwise a process could lock down a lot of memory, get swapped
439 430 * out, and never have enough to get swapped back in.
440 431 */
441 432 if (user_set)
442 433 pages_pp_maximum = pages_pp_maximum_startup;
443 434 else
444 435 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
445 436 + btop(4 * 1024 * 1024);
446 437
447 438 if (pages_pp_maximum <= p_min) {
448 439 pages_pp_maximum = p_min;
449 440 }
450 441 }
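A worked example of the default sizing, assuming 4 KB pages and 1 GB of post-startup availrmem (both figures purely illustrative): the divide-by-25 supplies the 4%, and btop(4MB) adds 1024 pages.

    /*
     * 262144 pages of availrmem (1 GB at 4 KB/page), zero delta:
     *
     *      pages_pp_maximum = (262144 - 0) / 25 + btop(4 * 1024 * 1024)
     *                       = 10485 + 1024
     *                       = 11509 pages (about 45 MB kept unlockable)
     */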
451 442
452 443 void
453 444 set_max_page_get(pgcnt_t target_total_pages)
454 445 {
455 446 max_page_get = target_total_pages / 2;
456 447 }
457 448
458 449 static pgcnt_t pending_delete;
459 450
460 451 /*ARGSUSED*/
461 452 static void
462 453 page_mem_config_post_add(
463 454 void *arg,
464 455 pgcnt_t delta_pages)
465 456 {
466 457 set_max_page_get(total_pages - pending_delete);
467 458 init_pages_pp_maximum();
468 459 }
469 460
470 461 /*ARGSUSED*/
471 462 static int
472 463 page_mem_config_pre_del(
473 464 void *arg,
474 465 pgcnt_t delta_pages)
475 466 {
476 467 pgcnt_t nv;
477 468
478 469 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
479 470 set_max_page_get(total_pages - nv);
480 471 return (0);
481 472 }
482 473
483 474 /*ARGSUSED*/
484 475 static void
485 476 page_mem_config_post_del(
486 477 void *arg,
487 478 pgcnt_t delta_pages,
488 479 int cancelled)
489 480 {
490 481 pgcnt_t nv;
491 482
492 483 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
493 484 set_max_page_get(total_pages - nv);
494 485 if (!cancelled)
495 486 init_pages_pp_maximum();
496 487 }
497 488
498 489 static kphysm_setup_vector_t page_mem_config_vec = {
499 490 KPHYSM_SETUP_VECTOR_VERSION,
500 491 page_mem_config_post_add,
501 492 page_mem_config_pre_del,
502 493 page_mem_config_post_del,
503 494 };
504 495
505 496 static void
506 497 page_init_mem_config(void)
507 498 {
508 499 int ret;
509 500
510 501 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
511 502 ASSERT(ret == 0);
512 503 }
513 504
514 505 /*
515 506 * Evenly spread out the PCF counters for large free pages
516 507 */
517 508 static void
518 509 page_free_large_ctr(pgcnt_t npages)
519 510 {
520 511 static struct pcf *p = pcf;
521 512 pgcnt_t lump;
522 513
523 514 freemem += npages;
524 515
525 516 lump = roundup(npages, pcf_fanout) / pcf_fanout;
526 517
527 518 while (npages > 0) {
528 519
529 520 ASSERT(!p->pcf_block);
530 521
531 522 if (lump < npages) {
532 523 p->pcf_count += (uint_t)lump;
533 524 npages -= lump;
534 525 } else {
535 526 p->pcf_count += (uint_t)npages;
536 527 npages = 0;
537 528 }
538 529
539 530 ASSERT(!p->pcf_wait);
540 531
541 532 if (++p > &pcf[pcf_fanout - 1])
542 533 p = pcf;
543 534 }
544 535
545 536 ASSERT(npages == 0);
546 537 }
547 538
548 539 /*
549 540 * Add a physical chunk of memory to the system free lists during startup.
550 541 * Platform specific startup() allocates the memory for the page structs.
551 542 *
552 543 * num - number of page structures
553 544 * base - page number (pfn) to be associated with the first page.
554 545 *
555 546 * Since we are doing this during startup (ie. single threaded), we will
556 547 * use shortcut routines to avoid any locking overhead while putting all
557 548 * these pages on the freelists.
558 549 *
559 550 * NOTE: Any changes performed to page_free(), must also be performed to
560 551 * add_physmem() since this is how we initialize all page_t's at
561 552 * boot time.
562 553 */
563 554 void
564 555 add_physmem(
565 556 page_t *pp,
566 557 pgcnt_t num,
567 558 pfn_t pnum)
568 559 {
569 560 page_t *root = NULL;
570 561 uint_t szc = page_num_pagesizes() - 1;
571 562 pgcnt_t large = page_get_pagecnt(szc);
572 563 pgcnt_t cnt = 0;
573 564
574 565 TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
575 566 "add_physmem:pp %p num %lu", pp, num);
576 567
577 568 /*
578 569 * Arbitrarily limit the max page_get request
579 570 * to 1/2 of the page structs we have.
580 571 */
581 572 total_pages += num;
582 573 set_max_page_get(total_pages);
583 574
584 575 PLCNT_MODIFY_MAX(pnum, (long)num);
585 576
586 577 /*
587 578 * The physical space for the pages array
588 579 * representing ram pages has already been
589 580 * allocated. Here we initialize each lock
590 581 * in the page structure, and put each on
591 582 * the free list
592 583 */
593 584 for (; num; pp++, pnum++, num--) {
594 585
595 586 /*
596 587 * this needs to fill in the page number
597 588 * and do any other arch specific initialization
598 589 */
599 590 add_physmem_cb(pp, pnum);
600 591
601 592 pp->p_lckcnt = 0;
602 593 pp->p_cowcnt = 0;
603 594 pp->p_slckcnt = 0;
604 595
605 596 /*
606 597 * Initialize the page lock as unlocked, since nobody
607 598 * can see or access this page yet.
608 599 */
609 600 pp->p_selock = 0;
610 601
611 602 /*
612 603 * Initialize IO lock
613 604 */
614 605 page_iolock_init(pp);
615 606
616 607 /*
617 608 * initialize other fields in the page_t
618 609 */
619 610 PP_SETFREE(pp);
620 611 page_clr_all_props(pp);
621 612 PP_SETAGED(pp);
622 613 pp->p_offset = (u_offset_t)-1;
623 614 pp->p_next = pp;
624 615 pp->p_prev = pp;
625 616
626 617 /*
627 618 * Simple case: System doesn't support large pages.
628 619 */
629 620 if (szc == 0) {
630 621 pp->p_szc = 0;
631 622 page_free_at_startup(pp);
632 623 continue;
633 624 }
634 625
635 626 /*
636 627 * Handle unaligned pages, we collect them up onto
637 628 * the root page until we have a full large page.
638 629 */
639 630 if (!IS_P2ALIGNED(pnum, large)) {
640 631
641 632 /*
642 633 * If not in a large page,
643 634 * just free as small page.
644 635 */
645 636 if (root == NULL) {
646 637 pp->p_szc = 0;
647 638 page_free_at_startup(pp);
648 639 continue;
649 640 }
650 641
651 642 /*
652 643 * Link a constituent page into the large page.
653 644 */
654 645 pp->p_szc = szc;
655 646 page_list_concat(&root, &pp);
656 647
657 648 /*
658 649 * When large page is fully formed, free it.
659 650 */
660 651 if (++cnt == large) {
661 652 page_free_large_ctr(cnt);
662 653 page_list_add_pages(root, PG_LIST_ISINIT);
663 654 root = NULL;
664 655 cnt = 0;
665 656 }
666 657 continue;
667 658 }
668 659
669 660 /*
670 661 * At this point we have a page number which
671 662 * is aligned. We assert that we aren't already
672 663 * in a different large page.
673 664 */
674 665 ASSERT(IS_P2ALIGNED(pnum, large));
675 666 ASSERT(root == NULL && cnt == 0);
676 667
677 668 /*
678 669 * If insufficient number of pages left to form
679 670 * a large page, just free the small page.
680 671 */
681 672 if (num < large) {
682 673 pp->p_szc = 0;
683 674 page_free_at_startup(pp);
684 675 continue;
685 676 }
686 677
687 678 /*
688 679 * Otherwise start a new large page.
689 680 */
690 681 pp->p_szc = szc;
691 682 cnt++;
692 683 root = pp;
693 684 }
694 685 ASSERT(root == NULL && cnt == 0);
695 686 }
696 687
697 688 /*
698 689 * Find a page representing the specified [vp, offset].
699 690 * If we find the page but it is intransit coming in,
700 691 * it will have an "exclusive" lock and we wait for
701 692 * the i/o to complete. A page found on the free list
702 693 * is always reclaimed and then locked. On success, the page
703 694 * is locked, its data is valid and it isn't on the free
704 695 * list, while a NULL is returned if the page doesn't exist.
705 696 */
706 697 page_t *
707 698 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
708 699 {
709 700 return (page_lookup_create(vp, off, se, NULL, NULL, 0));
710 701 }
711 702
712 703 /*
713 704 * Find a page representing the specified [vp, offset].
714 705 * We either return the one we found or, if passed in,
715 706 * create one with identity of [vp, offset] of the
716 707 * pre-allocated page. If we find existing page but it is
717 708 * intransit coming in, it will have an "exclusive" lock
718 709 * and we wait for the i/o to complete. A page found on
719 710 * the free list is always reclaimed and then locked.
720 711 * On success, the page is locked, its data is valid and
721 712 * it isn't on the free list, while a NULL is returned
722 713 * if the page doesn't exist and newpp is NULL;
723 714 */
724 715 page_t *
725 716 page_lookup_create(
726 717 vnode_t *vp,
727 718 u_offset_t off,
728 719 se_t se,
729 720 page_t *newpp,
730 721 spgcnt_t *nrelocp,
731 722 int flags)
732 723 {
733 724 page_t *pp;
734 725 kmutex_t *phm;
735 726 ulong_t index;
736 727 uint_t hash_locked;
737 728 uint_t es;
738 729
739 730 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
740 731 VM_STAT_ADD(page_lookup_cnt[0]);
741 732 ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
742 733
743 734 /*
744 735 * Acquire the appropriate page hash lock since
745 736 * we have to search the hash list. Pages that
746 737 * hash to this list can't change identity while
747 738 * this lock is held.
748 739 */
749 740 hash_locked = 0;
750 741 index = PAGE_HASH_FUNC(vp, off);
751 742 phm = NULL;
752 743 top:
753 - PAGE_HASH_SEARCH(index, pp, vp, off);
744 + pp = page_hash_search(index, vp, off);
754 745 if (pp != NULL) {
755 746 VM_STAT_ADD(page_lookup_cnt[1]);
756 747 es = (newpp != NULL) ? 1 : 0;
757 748 es |= flags;
758 749 if (!hash_locked) {
759 750 VM_STAT_ADD(page_lookup_cnt[2]);
760 751 if (!page_try_reclaim_lock(pp, se, es)) {
761 752 /*
762 753 * On a miss, acquire the phm. Then
763 754 * next time, page_lock() will be called,
764 755 * causing a wait if the page is busy.
765 756 * Just looping with page_trylock() would
766 757 * get pretty boring.
767 758 */
768 759 VM_STAT_ADD(page_lookup_cnt[3]);
769 760 phm = PAGE_HASH_MUTEX(index);
770 761 mutex_enter(phm);
771 762 hash_locked = 1;
772 763 goto top;
773 764 }
774 765 } else {
775 766 VM_STAT_ADD(page_lookup_cnt[4]);
776 767 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
777 768 VM_STAT_ADD(page_lookup_cnt[5]);
778 769 goto top;
779 770 }
780 771 }
781 772
782 773 /*
783 774 * Since `pp' is locked it can not change identity now.
784 775 * Reconfirm we locked the correct page.
785 776 *
786 777 * Both the p_vnode and p_offset *must* be cast volatile
787 - * to force a reload of their values: The PAGE_HASH_SEARCH
788 - * macro will have stuffed p_vnode and p_offset into
778 + * to force a reload of their values: The page_hash_search
779 + * function will have stuffed p_vnode and p_offset into
789 780 * registers before calling page_trylock(); another thread,
790 781 * actually holding the hash lock, could have changed the
791 782 * page's identity in memory, but our registers would not
792 783 * be changed, fooling the reconfirmation. If the hash
793 784 * lock was held during the search, the casting would
794 785 * not be needed.
795 786 */
796 787 VM_STAT_ADD(page_lookup_cnt[6]);
797 788 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
798 789 ((volatile u_offset_t)(pp->p_offset) != off)) {
799 790 VM_STAT_ADD(page_lookup_cnt[7]);
800 791 if (hash_locked) {
801 792 panic("page_lookup_create: lost page %p",
802 793 (void *)pp);
803 794 /*NOTREACHED*/
804 795 }
805 796 page_unlock(pp);
806 797 phm = PAGE_HASH_MUTEX(index);
807 798 mutex_enter(phm);
808 799 hash_locked = 1;
809 800 goto top;
810 801 }
811 802
812 803 /*
813 804 * If page_trylock() was called, then pp may still be on
814 805 * the cachelist (can't be on the free list, it would not
815 806 * have been found in the search). If it is on the
816 807 * cachelist it must be pulled now. To pull the page from
817 808 * the cachelist, it must be exclusively locked.
818 809 *
819 810 * The other big difference between page_trylock() and
820 811 * page_lock(), is that page_lock() will pull the
821 812 * page from whatever free list (the cache list in this
822 813 * case) the page is on. If page_trylock() was used
823 814 * above, then we have to do the reclaim ourselves.
824 815 */
825 816 if ((!hash_locked) && (PP_ISFREE(pp))) {
826 817 ASSERT(PP_ISAGED(pp) == 0);
827 818 VM_STAT_ADD(page_lookup_cnt[8]);
828 819
829 820 /*
830 821 * page_reclaim will ensure that we
831 822 * have this page exclusively
832 823 */
833 824
834 825 if (!page_reclaim(pp, NULL)) {
835 826 /*
836 827 * Page_reclaim dropped whatever lock
837 828 * we held.
838 829 */
839 830 VM_STAT_ADD(page_lookup_cnt[9]);
840 831 phm = PAGE_HASH_MUTEX(index);
841 832 mutex_enter(phm);
842 833 hash_locked = 1;
843 834 goto top;
844 835 } else if (se == SE_SHARED && newpp == NULL) {
845 836 VM_STAT_ADD(page_lookup_cnt[10]);
846 837 page_downgrade(pp);
847 838 }
848 839 }
849 840
850 841 if (hash_locked) {
851 842 mutex_exit(phm);
852 843 }
853 844
854 845 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
855 846 PAGE_EXCL(pp) && nrelocp != NULL) {
856 847 ASSERT(nrelocp != NULL);
857 848 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
858 849 NULL);
859 850 if (*nrelocp > 0) {
860 851 VM_STAT_COND_ADD(*nrelocp == 1,
861 852 page_lookup_cnt[11]);
862 853 VM_STAT_COND_ADD(*nrelocp > 1,
863 854 page_lookup_cnt[12]);
864 855 pp = newpp;
865 856 se = SE_EXCL;
866 857 } else {
867 858 if (se == SE_SHARED) {
868 859 page_downgrade(pp);
869 860 }
870 861 VM_STAT_ADD(page_lookup_cnt[13]);
871 862 }
872 863 } else if (newpp != NULL && nrelocp != NULL) {
873 864 if (PAGE_EXCL(pp) && se == SE_SHARED) {
874 865 page_downgrade(pp);
875 866 }
876 867 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
877 868 page_lookup_cnt[14]);
878 869 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
879 870 page_lookup_cnt[15]);
880 871 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
881 872 page_lookup_cnt[16]);
882 873 } else if (newpp != NULL && PAGE_EXCL(pp)) {
883 874 se = SE_EXCL;
884 875 }
885 876 } else if (!hash_locked) {
886 877 VM_STAT_ADD(page_lookup_cnt[17]);
887 878 phm = PAGE_HASH_MUTEX(index);
888 879 mutex_enter(phm);
889 880 hash_locked = 1;
890 881 goto top;
891 882 } else if (newpp != NULL) {
892 883 /*
893 884 * If we have a preallocated page then
894 885 * insert it now and basically behave like
895 886 * page_create.
896 887 */
897 888 VM_STAT_ADD(page_lookup_cnt[18]);
898 889 /*
899 890 * Since we hold the page hash mutex and
900 891 * just searched for this page, page_hashin
901 892 * had better not fail. If it does, that
902 893 * means some thread did not follow the
903 894 * page hash mutex rules. Panic now and
904 895 * get it over with. As usual, go down
905 896 * holding all the locks.
906 897 */
907 898 ASSERT(MUTEX_HELD(phm));
908 899 if (!page_hashin(newpp, vp, off, phm)) {
909 900 ASSERT(MUTEX_HELD(phm));
910 901 panic("page_lookup_create: hashin failed %p %p %llx %p",
911 902 (void *)newpp, (void *)vp, off, (void *)phm);
912 903 /*NOTREACHED*/
913 904 }
914 905 ASSERT(MUTEX_HELD(phm));
915 906 mutex_exit(phm);
916 907 phm = NULL;
917 908 page_set_props(newpp, P_REF);
918 909 page_io_lock(newpp);
919 910 pp = newpp;
920 911 se = SE_EXCL;
921 912 } else {
922 913 VM_STAT_ADD(page_lookup_cnt[19]);
923 914 mutex_exit(phm);
924 915 }
925 916
926 917 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
927 918
928 919 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
929 920
930 921 return (pp);
931 922 }
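The volatile recheck described in page_lookup_create() above is a general idiom for unlocked hash lookups; a hedged generic sketch follows (obj_t, obj_hash_search(), obj_trylock(), and obj_unlock() are hypothetical, not kernel interfaces):

    #include <stdint.h>

    typedef struct obj {
            uint64_t o_key;
            /* ... */
    } obj_t;

    /*
     * Search without the bucket lock, lock the object, then re-read the
     * identity field through a volatile-qualified access so the compiler
     * cannot reuse a value cached in a register from before the lock
     * was taken.
     */
    obj_t *
    lookup_locked(ulong_t index, uint64_t key)
    {
            obj_t *obj;
    retry:
            obj = obj_hash_search(index, key);      /* unlocked search */
            if (obj == NULL)
                    return (NULL);
            if (!obj_trylock(obj))
                    goto retry;
            if (*(volatile uint64_t *)&obj->o_key != key) {
                    /* identity changed before we took the lock; retry */
                    obj_unlock(obj);
                    goto retry;
            }
            return (obj);   /* identity confirmed while locked */
    }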
932 923
933 924 /*
934 925 * Search the hash list for the page representing the
935 926 * specified [vp, offset] and return it locked. Skip
936 927 * free pages and pages that cannot be locked as requested.
937 928 * Used while attempting to kluster pages.
938 929 */
939 930 page_t *
940 931 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
941 932 {
942 933 page_t *pp;
943 934 kmutex_t *phm;
944 935 ulong_t index;
945 936 uint_t locked;
946 937
947 938 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
948 939 VM_STAT_ADD(page_lookup_nowait_cnt[0]);
949 940
950 941 index = PAGE_HASH_FUNC(vp, off);
951 - PAGE_HASH_SEARCH(index, pp, vp, off);
942 + pp = page_hash_search(index, vp, off);
952 943 locked = 0;
953 944 if (pp == NULL) {
954 945 top:
955 946 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
956 947 locked = 1;
957 948 phm = PAGE_HASH_MUTEX(index);
958 949 mutex_enter(phm);
959 - PAGE_HASH_SEARCH(index, pp, vp, off);
950 + pp = page_hash_search(index, vp, off);
960 951 }
961 952
962 953 if (pp == NULL || PP_ISFREE(pp)) {
963 954 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
964 955 pp = NULL;
965 956 } else {
966 957 if (!page_trylock(pp, se)) {
967 958 VM_STAT_ADD(page_lookup_nowait_cnt[3]);
968 959 pp = NULL;
969 960 } else {
970 961 VM_STAT_ADD(page_lookup_nowait_cnt[4]);
971 962 /*
972 963 * See the comment in page_lookup()
973 964 */
974 965 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
975 966 ((u_offset_t)(pp->p_offset) != off)) {
976 967 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
977 968 if (locked) {
978 969 panic("page_lookup_nowait %p",
979 970 (void *)pp);
980 971 /*NOTREACHED*/
981 972 }
982 973 page_unlock(pp);
983 974 goto top;
984 975 }
985 976 if (PP_ISFREE(pp)) {
986 977 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
987 978 page_unlock(pp);
988 979 pp = NULL;
989 980 }
990 981 }
991 982 }
992 983 if (locked) {
993 984 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
994 985 mutex_exit(phm);
995 986 }
996 987
997 988 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
998 989
999 990 return (pp);
1000 991 }
1001 992
1002 993 /*
1003 994 * Search the hash list for a page with the specified [vp, off]
1004 995 * that is known to exist and is already locked. This routine
1005 996 * is typically used by segment SOFTUNLOCK routines.
1006 997 */
1007 998 page_t *
1008 999 page_find(vnode_t *vp, u_offset_t off)
1009 1000 {
1010 1001 page_t *pp;
1011 1002 kmutex_t *phm;
1012 1003 ulong_t index;
1013 1004
1014 1005 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1015 1006 VM_STAT_ADD(page_find_cnt);
1016 1007
1017 1008 index = PAGE_HASH_FUNC(vp, off);
1018 1009 phm = PAGE_HASH_MUTEX(index);
1019 1010
1020 1011 mutex_enter(phm);
1021 - PAGE_HASH_SEARCH(index, pp, vp, off);
1012 + pp = page_hash_search(index, vp, off);
1022 1013 mutex_exit(phm);
1023 1014
1024 1015 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1025 1016 return (pp);
1026 1017 }
1027 1018
1028 1019 /*
1029 1020 * Determine whether a page with the specified [vp, off]
1030 1021 * currently exists in the system. Obviously this should
1031 1022 * only be considered as a hint since nothing prevents the
1032 1023 * page from disappearing or appearing immediately after
1033 1024 * the return from this routine. Subsequently, we don't
1034 1025 * even bother to lock the list.
1035 1026 */
1036 1027 page_t *
1037 1028 page_exists(vnode_t *vp, u_offset_t off)
1038 1029 {
1039 - page_t *pp;
1040 1030 ulong_t index;
1041 1031
1042 1032 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1043 1033 VM_STAT_ADD(page_exists_cnt);
1044 1034
1045 1035 index = PAGE_HASH_FUNC(vp, off);
1046 - PAGE_HASH_SEARCH(index, pp, vp, off);
1047 1036
1048 - return (pp);
1037 + return (page_hash_search(index, vp, off));
1049 1038 }
1050 1039
1051 1040 /*
1052 1041 * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
1053 1042 * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array
1054 1043 * with these pages locked SHARED. If necessary reclaim pages from
1055 1044 * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1056 1045 *
1057 1046 * If we fail to lock pages still return 1 if pages exist and contiguous.
1058 1047 * But in this case return value is just a hint. ppa array won't be filled.
1059 1048 * Caller should initialize ppa[0] as NULL to distinguish return value.
1060 1049 *
1061 1050 * Returns 0 if pages don't exist or not physically contiguous.
1062 1051 *
1063 1052 * This routine doesn't work for anonymous(swapfs) pages.
1064 1053 */
1065 1054 int
1066 1055 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1067 1056 {
1068 1057 pgcnt_t pages;
1069 1058 pfn_t pfn;
1070 1059 page_t *rootpp;
1071 1060 pgcnt_t i;
1072 1061 pgcnt_t j;
1073 1062 u_offset_t save_off = off;
1074 1063 ulong_t index;
1075 1064 kmutex_t *phm;
1076 1065 page_t *pp;
1077 1066 uint_t pszc;
1078 1067 int loopcnt = 0;
1079 1068
1080 1069 ASSERT(szc != 0);
1081 1070 ASSERT(vp != NULL);
1082 1071 ASSERT(!IS_SWAPFSVP(vp));
1083 1072 ASSERT(!VN_ISKAS(vp));
1084 1073
1085 1074 again:
1086 1075 if (++loopcnt > 3) {
1087 1076 VM_STAT_ADD(page_exphcontg[0]);
1088 1077 return (0);
1089 1078 }
1090 1079
1091 1080 index = PAGE_HASH_FUNC(vp, off);
1092 1081 phm = PAGE_HASH_MUTEX(index);
1093 1082
1094 1083 mutex_enter(phm);
1095 - PAGE_HASH_SEARCH(index, pp, vp, off);
1084 + pp = page_hash_search(index, vp, off);
1096 1085 mutex_exit(phm);
1097 1086
1098 1087 VM_STAT_ADD(page_exphcontg[1]);
1099 1088
1100 1089 if (pp == NULL) {
1101 1090 VM_STAT_ADD(page_exphcontg[2]);
1102 1091 return (0);
1103 1092 }
1104 1093
1105 1094 pages = page_get_pagecnt(szc);
1106 1095 rootpp = pp;
1107 1096 pfn = rootpp->p_pagenum;
1108 1097
1109 1098 if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1110 1099 VM_STAT_ADD(page_exphcontg[3]);
1111 1100 if (!page_trylock(pp, SE_SHARED)) {
1112 1101 VM_STAT_ADD(page_exphcontg[4]);
1113 1102 return (1);
1114 1103 }
1115 1104 /*
1116 1105 * Also check whether p_pagenum was modified by DR.
1117 1106 */
1118 1107 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1119 1108 pp->p_offset != off || pp->p_pagenum != pfn) {
1120 1109 VM_STAT_ADD(page_exphcontg[5]);
1121 1110 page_unlock(pp);
1122 1111 off = save_off;
1123 1112 goto again;
1124 1113 }
1125 1114 /*
1126 1115 * szc was non-zero and vnode and offset matched after we
1127 1116 * locked the page, which means it can't become free on us.
1128 1117 */
1129 1118 ASSERT(!PP_ISFREE(pp));
1130 1119 if (!IS_P2ALIGNED(pfn, pages)) {
1131 1120 page_unlock(pp);
1132 1121 return (0);
1133 1122 }
1134 1123 ppa[0] = pp;
1135 1124 pp++;
1136 1125 off += PAGESIZE;
1137 1126 pfn++;
1138 1127 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1139 1128 if (!page_trylock(pp, SE_SHARED)) {
1140 1129 VM_STAT_ADD(page_exphcontg[6]);
1141 1130 pp--;
1142 1131 while (i-- > 0) {
1143 1132 page_unlock(pp);
1144 1133 pp--;
1145 1134 }
1146 1135 ppa[0] = NULL;
1147 1136 return (1);
1148 1137 }
1149 1138 if (pp->p_szc != pszc) {
1150 1139 VM_STAT_ADD(page_exphcontg[7]);
1151 1140 page_unlock(pp);
1152 1141 pp--;
1153 1142 while (i-- > 0) {
1154 1143 page_unlock(pp);
1155 1144 pp--;
1156 1145 }
1157 1146 ppa[0] = NULL;
1158 1147 off = save_off;
1159 1148 goto again;
1160 1149 }
1161 1150 /*
1162 1151 * szc the same as for previous already locked pages
1163 1152 * with right identity. Since this page had correct
1164 1153 * szc after we locked it can't get freed or destroyed
1165 1154 * and therefore must have the expected identity.
1166 1155 */
1167 1156 ASSERT(!PP_ISFREE(pp));
1168 1157 if (pp->p_vnode != vp ||
1169 1158 pp->p_offset != off) {
1170 1159 panic("page_exists_physcontig: "
1171 1160 "large page identity doesn't match");
1172 1161 }
1173 1162 ppa[i] = pp;
1174 1163 ASSERT(pp->p_pagenum == pfn);
1175 1164 }
1176 1165 VM_STAT_ADD(page_exphcontg[8]);
1177 1166 ppa[pages] = NULL;
1178 1167 return (1);
1179 1168 } else if (pszc >= szc) {
1180 1169 VM_STAT_ADD(page_exphcontg[9]);
1181 1170 if (!IS_P2ALIGNED(pfn, pages)) {
1182 1171 return (0);
1183 1172 }
1184 1173 return (1);
1185 1174 }
1186 1175
1187 1176 if (!IS_P2ALIGNED(pfn, pages)) {
1188 1177 VM_STAT_ADD(page_exphcontg[10]);
1189 1178 return (0);
1190 1179 }
1191 1180
1192 1181 if (page_numtomemseg_nolock(pfn) !=
1193 1182 page_numtomemseg_nolock(pfn + pages - 1)) {
1194 1183 VM_STAT_ADD(page_exphcontg[11]);
1195 1184 return (0);
1196 1185 }
1197 1186
1198 1187 /*
1199 1188 * We loop up 4 times across pages to promote page size.
1200 1189 * We're extra cautious to promote page size atomically with respect
1201 1190 * to everybody else. But we can probably optimize into 1 loop if
1202 1191 * this becomes an issue.
1203 1192 */
1204 1193
1205 1194 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1206 1195 if (!page_trylock(pp, SE_EXCL)) {
1207 1196 VM_STAT_ADD(page_exphcontg[12]);
1208 1197 break;
1209 1198 }
1210 1199 /*
1211 1200 * Check whether p_pagenum was modified by DR.
1212 1201 */
1213 1202 if (pp->p_pagenum != pfn) {
1214 1203 page_unlock(pp);
1215 1204 break;
1216 1205 }
1217 1206 if (pp->p_vnode != vp ||
1218 1207 pp->p_offset != off) {
1219 1208 VM_STAT_ADD(page_exphcontg[13]);
1220 1209 page_unlock(pp);
1221 1210 break;
1222 1211 }
1223 1212 if (pp->p_szc >= szc) {
1224 1213 ASSERT(i == 0);
1225 1214 page_unlock(pp);
1226 1215 off = save_off;
1227 1216 goto again;
1228 1217 }
1229 1218 }
1230 1219
1231 1220 if (i != pages) {
1232 1221 VM_STAT_ADD(page_exphcontg[14]);
1233 1222 --pp;
1234 1223 while (i-- > 0) {
1235 1224 page_unlock(pp);
1236 1225 --pp;
1237 1226 }
1238 1227 return (0);
1239 1228 }
1240 1229
1241 1230 pp = rootpp;
1242 1231 for (i = 0; i < pages; i++, pp++) {
1243 1232 if (PP_ISFREE(pp)) {
1244 1233 VM_STAT_ADD(page_exphcontg[15]);
1245 1234 ASSERT(!PP_ISAGED(pp));
1246 1235 ASSERT(pp->p_szc == 0);
1247 1236 if (!page_reclaim(pp, NULL)) {
1248 1237 break;
1249 1238 }
1250 1239 } else {
1251 1240 ASSERT(pp->p_szc < szc);
1252 1241 VM_STAT_ADD(page_exphcontg[16]);
1253 1242 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1254 1243 }
1255 1244 }
1256 1245 if (i < pages) {
1257 1246 VM_STAT_ADD(page_exphcontg[17]);
1258 1247 /*
1259 1248 * page_reclaim failed because we were out of memory.
1260 1249 * Drop the rest of the locks and return because this page
1261 1250 * must already be reallocated anyway.
1262 1251 */
1263 1252 pp = rootpp;
1264 1253 for (j = 0; j < pages; j++, pp++) {
1265 1254 if (j != i) {
1266 1255 page_unlock(pp);
1267 1256 }
1268 1257 }
1269 1258 return (0);
1270 1259 }
1271 1260
1272 1261 off = save_off;
1273 1262 pp = rootpp;
1274 1263 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1275 1264 ASSERT(PAGE_EXCL(pp));
1276 1265 ASSERT(!PP_ISFREE(pp));
1277 1266 ASSERT(!hat_page_is_mapped(pp));
1278 1267 ASSERT(pp->p_vnode == vp);
1279 1268 ASSERT(pp->p_offset == off);
1280 1269 pp->p_szc = szc;
1281 1270 }
1282 1271 pp = rootpp;
1283 1272 for (i = 0; i < pages; i++, pp++) {
1284 1273 if (ppa == NULL) {
1285 1274 page_unlock(pp);
1286 1275 } else {
1287 1276 ppa[i] = pp;
1288 1277 page_downgrade(ppa[i]);
1289 1278 }
1290 1279 }
1291 1280 if (ppa != NULL) {
1292 1281 ppa[pages] = NULL;
1293 1282 }
1294 1283 VM_STAT_ADD(page_exphcontg[18]);
1295 1284 ASSERT(vp->v_pages != NULL);
1296 1285 return (1);
1297 1286 }
1298 1287
1299 1288 /*
1300 1289 * Determine whether a page with the specified [vp, off]
1301 1290 * currently exists in the system and if so return its
1302 1291 * size code. Obviously this should only be considered as
1303 1292 * a hint since nothing prevents the page from disappearing
1304 1293 * or appearing immediately after the return from this routine.
1305 1294 */
1306 1295 int
1307 1296 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1308 1297 {
1309 1298 page_t *pp;
1310 1299 kmutex_t *phm;
1311 1300 ulong_t index;
1312 1301 int rc = 0;
1313 1302
1314 1303 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1315 1304 ASSERT(szc != NULL);
1316 1305 VM_STAT_ADD(page_exists_forreal_cnt);
1317 1306
1318 1307 index = PAGE_HASH_FUNC(vp, off);
1319 1308 phm = PAGE_HASH_MUTEX(index);
1320 1309
1321 1310 mutex_enter(phm);
1322 - PAGE_HASH_SEARCH(index, pp, vp, off);
1311 + pp = page_hash_search(index, vp, off);
1323 1312 if (pp != NULL) {
1324 1313 *szc = pp->p_szc;
1325 1314 rc = 1;
1326 1315 }
1327 1316 mutex_exit(phm);
1328 1317 return (rc);
1329 1318 }
1330 1319
1331 1320 /* wakeup threads waiting for pages in page_create_get_something() */
1332 1321 void
1333 1322 wakeup_pcgs(void)
1334 1323 {
1335 1324 if (!CV_HAS_WAITERS(&pcgs_cv))
1336 1325 return;
1337 1326 cv_broadcast(&pcgs_cv);
1338 1327 }
1339 1328
1340 1329 /*
1341 1330 * 'freemem' is used all over the kernel as an indication of how many
1342 1331 * pages are free (either on the cache list or on the free page list)
1343 1332 * in the system. In very few places is a really accurate 'freemem'
1344 1333 * needed. To avoid contention on the lock protecting the
1345 1334 * single freemem, it was spread out into NCPU buckets. Set_freemem
1346 1335 * sets freemem to the total of all NCPU buckets. It is called from
1347 1336 * clock() on each TICK.
1348 1337 */
1349 1338 void
1350 1339 set_freemem()
1351 1340 {
1352 1341 struct pcf *p;
1353 1342 ulong_t t;
1354 1343 uint_t i;
1355 1344
1356 1345 t = 0;
1357 1346 p = pcf;
1358 1347 for (i = 0; i < pcf_fanout; i++) {
1359 1348 t += p->pcf_count;
1360 1349 p++;
1361 1350 }
1362 1351 freemem = t;
1363 1352
1364 1353 /*
1365 1354 * Don't worry about grabbing mutex. It's not that
1366 1355 * critical if we miss a tick or two. This is
1367 1356 * where we wakeup possible delayers in
1368 1357 * page_create_get_something().
1369 1358 */
1370 1359 wakeup_pcgs();
1371 1360 }
1372 1361
1373 1362 ulong_t
1374 1363 get_freemem()
1375 1364 {
1376 1365 struct pcf *p;
1377 1366 ulong_t t;
1378 1367 uint_t i;
1379 1368
1380 1369 t = 0;
1381 1370 p = pcf;
1382 1371 for (i = 0; i < pcf_fanout; i++) {
1383 1372 t += p->pcf_count;
1384 1373 p++;
1385 1374 }
1386 1375 /*
1387 1376 * We just calculated it, might as well set it.
1388 1377 */
1389 1378 freemem = t;
1390 1379 return (t);
1391 1380 }
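set_freemem() and get_freemem() are the read side of a classic fan-out counter; a hedged sketch of the matching write side (NBUCKETS, struct bucket, and counter_add() are illustrative, not kernel interfaces):

    #define NBUCKETS    8                   /* power of two, illustrative */

    struct bucket {
            kmutex_t b_lock;                /* protects b_count */
            ulong_t  b_count;
            ulong_t  b_fill[6];             /* to line up on the caches */
    };

    static struct bucket buckets[NBUCKETS];

    /*
     * A writer touches only its own bucket's lock, so writers rarely
     * contend; a reader sums every bucket for an approximate total,
     * exactly as set_freemem() does with the pcf array.
     */
    static void
    counter_add(unsigned int cpu_seqid, long delta)
    {
            struct bucket *b = &buckets[cpu_seqid & (NBUCKETS - 1)];

            mutex_enter(&b->b_lock);
            b->b_count += (ulong_t)delta;
            mutex_exit(&b->b_lock);
    }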
1392 1381
1393 1382 /*
1394 1383 * Acquire all of the page cache & free (pcf) locks.
1395 1384 */
1396 1385 void
1397 1386 pcf_acquire_all()
1398 1387 {
1399 1388 struct pcf *p;
1400 1389 uint_t i;
1401 1390
1402 1391 p = pcf;
1403 1392 for (i = 0; i < pcf_fanout; i++) {
1404 1393 mutex_enter(&p->pcf_lock);
1405 1394 p++;
1406 1395 }
1407 1396 }
1408 1397
1409 1398 /*
1410 1399 * Release all the pcf_locks.
1411 1400 */
1412 1401 void
1413 1402 pcf_release_all()
1414 1403 {
1415 1404 struct pcf *p;
1416 1405 uint_t i;
1417 1406
1418 1407 p = pcf;
1419 1408 for (i = 0; i < pcf_fanout; i++) {
1420 1409 mutex_exit(&p->pcf_lock);
1421 1410 p++;
1422 1411 }
1423 1412 }
1424 1413
1425 1414 /*
1426 1415 * Inform the VM system that we need some pages freed up.
1427 1416 * Calls must be symmetric, e.g.:
1428 1417 *
1429 1418 * page_needfree(100);
1430 1419 * wait a bit;
1431 1420 * page_needfree(-100);
1432 1421 */
1433 1422 void
1434 1423 page_needfree(spgcnt_t npages)
1435 1424 {
1436 1425 mutex_enter(&new_freemem_lock);
1437 1426 needfree += npages;
1438 1427 mutex_exit(&new_freemem_lock);
1439 1428 }
1440 1429
1441 1430 /*
1442 1431 * Throttle for page_create(): try to prevent freemem from dropping
1443 1432 * below throttlefree. We can't provide a 100% guarantee because
1444 1433 * KM_NOSLEEP allocations, page_reclaim(), and various other things
1445 1434 * nibble away at the freelist. However, we can block all PG_WAIT
1446 1435 * allocations until memory becomes available. The motivation is
1447 1436 * that several things can fall apart when there's no free memory:
1448 1437 *
1449 1438 * (1) If pageout() needs memory to push a page, the system deadlocks.
1450 1439 *
1451 1440 * (2) By (broken) specification, timeout(9F) can neither fail nor
1452 1441 * block, so it has no choice but to panic the system if it
1453 1442 * cannot allocate a callout structure.
1454 1443 *
1455 1444 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1456 1445 * it panics if it cannot allocate a callback structure.
1457 1446 *
1458 1447 * (4) Untold numbers of third-party drivers have not yet been hardened
1459 1448 * against KM_NOSLEEP and/or allocb() failures; they simply assume
1460 1449 * success and panic the system with a data fault on failure.
1461 1450 * (The long-term solution to this particular problem is to ship
1462 1451 * hostile fault-injecting DEBUG kernels with the DDK.)
1463 1452 *
1464 1453 * It is theoretically impossible to guarantee success of non-blocking
1465 1454 * allocations, but in practice, this throttle is very hard to break.
1466 1455 */
1467 1456 static int
1468 1457 page_create_throttle(pgcnt_t npages, int flags)
1469 1458 {
1470 1459 ulong_t fm;
1471 1460 uint_t i;
1472 1461 pgcnt_t tf; /* effective value of throttlefree */
1473 1462
1474 1463 /*
1475 1464 * Normal priority allocations.
1476 1465 */
1477 1466 if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1478 1467 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1479 1468 return (freemem >= npages + throttlefree);
1480 1469 }
1481 1470
1482 1471 /*
1483 1472 * Never deny pages when:
1484 1473 * - it's a thread that cannot block [NOMEMWAIT()]
1485 1474 * - the allocation cannot block and must not fail
1486 1475 * - the allocation cannot block and is pageout dispensated
1487 1476 */
1488 1477 if (NOMEMWAIT() ||
1489 1478 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1490 1479 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1491 1480 return (1);
1492 1481
1493 1482 /*
1494 1483 * If the allocation can't block, we look favorably upon it
1495 1484 * unless we're below pageout_reserve. In that case we fail
1496 1485 * the allocation because we want to make sure there are a few
1497 1486 * pages available for pageout.
1498 1487 */
1499 1488 if ((flags & PG_WAIT) == 0)
1500 1489 return (freemem >= npages + pageout_reserve);
1501 1490
1502 1491 /* Calculate the effective throttlefree value */
1503 1492 tf = throttlefree -
1504 1493 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1505 1494
1506 1495 cv_signal(&proc_pageout->p_cv);
1507 1496
1508 1497 for (;;) {
1509 1498 fm = 0;
1510 1499 pcf_acquire_all();
1511 1500 mutex_enter(&new_freemem_lock);
1512 1501 for (i = 0; i < pcf_fanout; i++) {
1513 1502 fm += pcf[i].pcf_count;
1514 1503 pcf[i].pcf_wait++;
1515 1504 mutex_exit(&pcf[i].pcf_lock);
1516 1505 }
1517 1506 freemem = fm;
1518 1507 if (freemem >= npages + tf) {
1519 1508 mutex_exit(&new_freemem_lock);
1520 1509 break;
1521 1510 }
1522 1511 needfree += npages;
1523 1512 freemem_wait++;
1524 1513 cv_wait(&freemem_cv, &new_freemem_lock);
1525 1514 freemem_wait--;
1526 1515 needfree -= npages;
1527 1516 mutex_exit(&new_freemem_lock);
1528 1517 }
1529 1518 return (1);
1530 1519 }
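Setting aside the never-deny cases (NOMEMWAIT(), PG_PANIC, and non-waiting PG_PUSHPAGE), the throttle reduces to a comparison against a flag-dependent floor; a hedged restatement as a pure function (illustrative only, the names mirror the code above):

    static int
    throttle_would_admit(pgcnt_t fm, pgcnt_t npages, uint_t flags)
    {
            if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI)
                    return (fm >= npages + throttlefree);
            if ((flags & PG_WAIT) == 0)
                    return (fm >= npages + pageout_reserve);
            /* PG_WAIT: the real code sleeps until this becomes true */
            return (fm >= npages + throttlefree -
                ((flags & PG_PUSHPAGE) ? pageout_reserve : 0));
    }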
1531 1520
1532 1521 /*
1533 1522 * page_create_wait() is called to either coalesce pages from the
1534 1523 * different pcf buckets or to wait because there simply are not
1535 1524 * enough pages to satisfy the caller's request.
1536 1525 *
1537 1526 * Sadly, this is called from platform/vm/vm_machdep.c
1538 1527 */
1539 1528 int
1540 1529 page_create_wait(pgcnt_t npages, uint_t flags)
1541 1530 {
1542 1531 pgcnt_t total;
1543 1532 uint_t i;
1544 1533 struct pcf *p;
1545 1534
1546 1535 /*
1547 1536 * Wait until there are enough free pages to satisfy our
1548 1537 * entire request.
1549 1538 * We set needfree += npages before prodding pageout, to make sure
1550 1539 * it does real work when npages > lotsfree > freemem.
1551 1540 */
1552 1541 VM_STAT_ADD(page_create_not_enough);
1553 1542
1554 1543 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1555 1544 checkagain:
1556 1545 if ((flags & PG_NORELOC) &&
1557 1546 kcage_freemem < kcage_throttlefree + npages)
1558 1547 (void) kcage_create_throttle(npages, flags);
1559 1548
1560 1549 if (freemem < npages + throttlefree)
1561 1550 if (!page_create_throttle(npages, flags))
1562 1551 return (0);
1563 1552
1564 1553 if (pcf_decrement_bucket(npages) ||
1565 1554 pcf_decrement_multiple(&total, npages, 0))
1566 1555 return (1);
1567 1556
1568 1557 /*
1569 1558 * All of the pcf locks are held, there are not enough pages
1570 1559 * to satisfy the request (npages < total).
1571 1560 * Be sure to acquire the new_freemem_lock before dropping
1572 1561 * the pcf locks. This prevents dropping wakeups in page_free().
1573 1562 * The order is always pcf_lock then new_freemem_lock.
1574 1563 *
1575 1564 * Since we hold all the pcf locks, it is a good time to set freemem.
1576 1565 *
1577 1566 * If the caller does not want to wait, return now.
1578 1567 * Else turn the pageout daemon loose to find something
1579 1568 * and wait till it does.
1580 1569 *
1581 1570 */
1582 1571 freemem = total;
1583 1572
1584 1573 if ((flags & PG_WAIT) == 0) {
1585 1574 pcf_release_all();
1586 1575
1587 1576 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1588 1577 "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1589 1578 return (0);
1590 1579 }
1591 1580
1592 1581 ASSERT(proc_pageout != NULL);
1593 1582 cv_signal(&proc_pageout->p_cv);
1594 1583
1595 1584 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1596 1585 "page_create_sleep_start: freemem %ld needfree %ld",
1597 1586 freemem, needfree);
1598 1587
1599 1588 /*
1600 1589 * We are going to wait.
1601 1590 * We currently hold all of the pcf_locks,
1602 1591 * get the new_freemem_lock (it protects freemem_wait),
1603 1592 * before dropping the pcf_locks.
1604 1593 */
1605 1594 mutex_enter(&new_freemem_lock);
1606 1595
1607 1596 p = pcf;
1608 1597 for (i = 0; i < pcf_fanout; i++) {
1609 1598 p->pcf_wait++;
1610 1599 mutex_exit(&p->pcf_lock);
1611 1600 p++;
1612 1601 }
1613 1602
1614 1603 needfree += npages;
1615 1604 freemem_wait++;
1616 1605
1617 1606 cv_wait(&freemem_cv, &new_freemem_lock);
1618 1607
1619 1608 freemem_wait--;
1620 1609 needfree -= npages;
1621 1610
1622 1611 mutex_exit(&new_freemem_lock);
1623 1612
1624 1613 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1625 1614 "page_create_sleep_end: freemem %ld needfree %ld",
1626 1615 freemem, needfree);
1627 1616
1628 1617 VM_STAT_ADD(page_create_not_enough_again);
1629 1618 goto checkagain;
1630 1619 }
1631 1620 /*
1632 1621 * A routine to do the opposite of page_create_wait().
1633 1622 */
1634 1623 void
1635 1624 page_create_putback(spgcnt_t npages)
1636 1625 {
1637 1626 struct pcf *p;
1638 1627 pgcnt_t lump;
1639 1628 uint_t *which;
1640 1629
1641 1630 /*
1642 1631 * When a contiguous lump is broken up, we have to
1643 1632 * deal with lots of pages (min 64) so lets spread
1644 1633 * the wealth around.
1645 1634 */
1646 1635 lump = roundup(npages, pcf_fanout) / pcf_fanout;
1647 1636 freemem += npages;
1648 1637
1649 1638 for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1650 1639 which = &p->pcf_count;
1651 1640
1652 1641 mutex_enter(&p->pcf_lock);
1653 1642
1654 1643 if (p->pcf_block) {
1655 1644 which = &p->pcf_reserve;
1656 1645 }
1657 1646
1658 1647 if (lump < npages) {
1659 1648 *which += (uint_t)lump;
1660 1649 npages -= lump;
1661 1650 } else {
1662 1651 *which += (uint_t)npages;
1663 1652 npages = 0;
1664 1653 }
1665 1654
1666 1655 if (p->pcf_wait) {
1667 1656 mutex_enter(&new_freemem_lock);
1668 1657 /*
1669 1658 * Check to see if some other thread
1670 1659 * is actually waiting. Another bucket
1671 1660 * may have woken it up by now. If there
1672 1661 * are no waiters, then set our pcf_wait
1673 1662 * count to zero to avoid coming in here
1674 1663 * next time.
1675 1664 */
1676 1665 if (freemem_wait) {
1677 1666 if (npages > 1) {
1678 1667 cv_broadcast(&freemem_cv);
1679 1668 } else {
1680 1669 cv_signal(&freemem_cv);
1681 1670 }
1682 1671 p->pcf_wait--;
1683 1672 } else {
1684 1673 p->pcf_wait = 0;
1685 1674 }
1686 1675 mutex_exit(&new_freemem_lock);
1687 1676 }
1688 1677 mutex_exit(&p->pcf_lock);
1689 1678 }
1690 1679 ASSERT(npages == 0);
1691 1680 }
1692 1681
1693 1682 /*
1694 1683 * A helper routine for page_create_get_something.
1695 1684 * The indenting got too deep down there.
1696 1685 * Unblock the pcf counters. Any pages freed after
1697 1686 * pcf_block got set are moved to pcf_count and
1698 1687 * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1699 1688 */
1700 1689 static void
1701 1690 pcgs_unblock(void)
1702 1691 {
1703 1692 int i;
1704 1693 struct pcf *p;
1705 1694
1706 1695 /* Update freemem while we're here. */
1707 1696 freemem = 0;
1708 1697 p = pcf;
1709 1698 for (i = 0; i < pcf_fanout; i++) {
1710 1699 mutex_enter(&p->pcf_lock);
1711 1700 ASSERT(p->pcf_count == 0);
1712 1701 p->pcf_count = p->pcf_reserve;
1713 1702 p->pcf_block = 0;
1714 1703 freemem += p->pcf_count;
1715 1704 if (p->pcf_wait) {
1716 1705 mutex_enter(&new_freemem_lock);
1717 1706 if (freemem_wait) {
1718 1707 if (p->pcf_reserve > 1) {
1719 1708 cv_broadcast(&freemem_cv);
1720 1709 p->pcf_wait = 0;
1721 1710 } else {
1722 1711 cv_signal(&freemem_cv);
1723 1712 p->pcf_wait--;
1724 1713 }
1725 1714 } else {
1726 1715 p->pcf_wait = 0;
1727 1716 }
1728 1717 mutex_exit(&new_freemem_lock);
1729 1718 }
1730 1719 p->pcf_reserve = 0;
1731 1720 mutex_exit(&p->pcf_lock);
1732 1721 p++;
1733 1722 }
1734 1723 }
1735 1724
1736 1725 /*
1737 1726 * Called from page_create_va() when both the cache and free lists
1738 1727 * have been checked once.
1739 1728 *
1740 1729 * Either returns a page or panics since the accounting was done
1741 1730 * way before we got here.
1742 1731 *
1743 1732 * We don't come here often, so leave the accounting on permanently.
1744 1733 */
1745 1734
1746 1735 #define MAX_PCGS 100
1747 1736
1748 1737 #ifdef DEBUG
1749 1738 #define PCGS_TRIES 100
1750 1739 #else /* DEBUG */
1751 1740 #define PCGS_TRIES 10
1752 1741 #endif /* DEBUG */
1753 1742
1754 1743 #ifdef VM_STATS
1755 1744 uint_t pcgs_counts[PCGS_TRIES];
1756 1745 uint_t pcgs_too_many;
1757 1746 uint_t pcgs_entered;
1758 1747 uint_t pcgs_entered_noreloc;
1759 1748 uint_t pcgs_locked;
1760 1749 uint_t pcgs_cagelocked;
1761 1750 #endif /* VM_STATS */
1762 1751
1763 1752 static page_t *
1764 1753 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1765 1754 caddr_t vaddr, uint_t flags)
1766 1755 {
1767 1756 uint_t count;
1768 1757 page_t *pp;
1769 1758 uint_t locked, i;
1770 1759 struct pcf *p;
1771 1760 lgrp_t *lgrp;
1772 1761 int cagelocked = 0;
1773 1762
1774 1763 VM_STAT_ADD(pcgs_entered);
1775 1764
1776 1765 /*
1777 1766 * Tap any reserve freelists: if we fail now, we'll die
1778 1767 * since the page(s) we're looking for have already been
1779 1768 * accounted for.
1780 1769 */
1781 1770 flags |= PG_PANIC;
1782 1771
1783 1772 if ((flags & PG_NORELOC) != 0) {
1784 1773 VM_STAT_ADD(pcgs_entered_noreloc);
1785 1774 /*
1786 1775 * Requests for free pages from critical threads
1787 1776 * such as pageout still won't throttle here, but
1788 1777 * we must try again, to give the cageout thread
1789 1778 * another chance to catch up. Since we already
1790 1779 * accounted for the pages, we had better get them
1791 1780 * this time.
1792 1781 *
1793 1782 * N.B. All non-critical threads acquire the pcgs_cagelock
1794 1783 * to serialize access to the freelists. This implements a
1795 1784	 * turnstile-type synchronization to avoid starvation of
1796 1785 * critical requests for PG_NORELOC memory by non-critical
1797 1786 * threads: all non-critical threads must acquire a 'ticket'
1798 1787 * before passing through, which entails making sure
1799 1788 * kcage_freemem won't fall below minfree prior to grabbing
1800 1789 * pages from the freelists.
1801 1790 */
1802 1791 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1803 1792 mutex_enter(&pcgs_cagelock);
1804 1793 cagelocked = 1;
1805 1794 VM_STAT_ADD(pcgs_cagelocked);
1806 1795 }
1807 1796 }
1808 1797
1809 1798 /*
1810 1799 * Time to get serious.
1811 1800 * We failed to get a `correctly colored' page from both the
1812 1801 * free and cache lists.
1813 1802	 * We escalate in stages.
1814 1803	 *
1815 1804	 * First try both lists without worrying about color.
1816 1805 *
1817 1806	 * Then, grab all page accounting locks (i.e., pcf[]) and
1818 1807 * steal any pages that they have and set the pcf_block flag to
1819 1808 * stop deletions from the lists. This will help because
1820 1809 * a page can get added to the free list while we are looking
1821 1810 * at the cache list, then another page could be added to the cache
1822 1811 * list allowing the page on the free list to be removed as we
1823 1812 * move from looking at the cache list to the free list. This
1824 1813 * could happen over and over. We would never find the page
1825 1814 * we have accounted for.
1826 1815 *
1827 1816 * Noreloc pages are a subset of the global (relocatable) page pool.
1828 1817 * They are not tracked separately in the pcf bins, so it is
1829 1818 * impossible to know when doing pcf accounting if the available
1830 1819 * page(s) are noreloc pages or not. When looking for a noreloc page
1831 1820 * it is quite easy to end up here even if the global (relocatable)
1832 1821 * page pool has plenty of free pages but the noreloc pool is empty.
1833 1822 *
1834 1823 * When the noreloc pool is empty (or low), additional noreloc pages
1835 1824 * are created by converting pages from the global page pool. This
1836 1825 * process will stall during pcf accounting if the pcf bins are
1837 1826 * already locked. Such is the case when a noreloc allocation is
1838 1827 * looping here in page_create_get_something waiting for more noreloc
1839 1828 * pages to appear.
1840 1829 *
1841 1830 * Short of adding a new field to the pcf bins to accurately track
1842 1831 * the number of free noreloc pages, we instead do not grab the
1843 1832 * pcgs_lock, do not set the pcf blocks and do not timeout when
1844 1833 * allocating a noreloc page. This allows noreloc allocations to
1845 1834 * loop without blocking global page pool allocations.
1846 1835 *
1847 1836 * NOTE: the behaviour of page_create_get_something has not changed
1848 1837 * for the case of global page pool allocations.
1849 1838 */
1850 1839
1851 1840 flags &= ~PG_MATCH_COLOR;
1852 1841 locked = 0;
1853 1842 #if defined(__i386) || defined(__amd64)
1854 1843 flags = page_create_update_flags_x86(flags);
1855 1844 #endif
1856 1845
1857 1846 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1858 1847
1859 1848 for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1860 1849 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1861 1850 flags, lgrp);
1862 1851 if (pp == NULL) {
1863 1852 pp = page_get_cachelist(vp, off, seg, vaddr,
1864 1853 flags, lgrp);
1865 1854 }
1866 1855 if (pp == NULL) {
1867 1856 /*
1868 1857 * Serialize. Don't fight with other pcgs().
1869 1858 */
1870 1859 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1871 1860 mutex_enter(&pcgs_lock);
1872 1861 VM_STAT_ADD(pcgs_locked);
1873 1862 locked = 1;
1874 1863 p = pcf;
1875 1864 for (i = 0; i < pcf_fanout; i++) {
1876 1865 mutex_enter(&p->pcf_lock);
1877 1866 ASSERT(p->pcf_block == 0);
1878 1867 p->pcf_block = 1;
1879 1868 p->pcf_reserve = p->pcf_count;
1880 1869 p->pcf_count = 0;
1881 1870 mutex_exit(&p->pcf_lock);
1882 1871 p++;
1883 1872 }
1884 1873 freemem = 0;
1885 1874 }
1886 1875
1887 1876 if (count) {
1888 1877 /*
1889 1878 * Since page_free() puts pages on
1890 1879 * a list then accounts for it, we
1891 1880 * just have to wait for page_free()
1892 1881 * to unlock any page it was working
1893 1882 * with. The page_lock()-page_reclaim()
1894 1883 * path falls in the same boat.
1895 1884 *
1896 1885 * We don't need to check on the
1897 1886 * PG_WAIT flag, we have already
1898 1887 * accounted for the page we are
1899 1888 * looking for in page_create_va().
1900 1889 *
1901 1890 * We just wait a moment to let any
1902 1891 * locked pages on the lists free up,
1903 1892 * then continue around and try again.
1904 1893 *
1905 1894 * Will be awakened by set_freemem().
1906 1895 */
1907 1896 mutex_enter(&pcgs_wait_lock);
1908 1897 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1909 1898 mutex_exit(&pcgs_wait_lock);
1910 1899 }
1911 1900 } else {
1912 1901 #ifdef VM_STATS
1913 1902 if (count >= PCGS_TRIES) {
1914 1903 VM_STAT_ADD(pcgs_too_many);
1915 1904 } else {
1916 1905 VM_STAT_ADD(pcgs_counts[count]);
1917 1906 }
1918 1907 #endif
1919 1908 if (locked) {
1920 1909 pcgs_unblock();
1921 1910 mutex_exit(&pcgs_lock);
1922 1911 }
1923 1912 if (cagelocked)
1924 1913 mutex_exit(&pcgs_cagelock);
1925 1914 return (pp);
1926 1915 }
1927 1916 }
1928 1917 /*
1929 1918 * we go down holding the pcf locks.
1930 1919 */
1931 1920 panic("no %spage found %d",
1932 1921 ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1933 1922 /*NOTREACHED*/
1934 1923 }
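
The N.B. block above describes a turnstile: every non-critical PG_NORELOC request must take pcgs_cagelock and prove the cage is healthy before it may draw pages, so critical threads such as pageout never starve behind a crowd of ordinary allocations. A hedged miniature of that ticket pattern, with invented names throughout:

/*
 * Illustration only -- every identifier here is made up.  One
 * non-critical caller at a time holds the ticket lock, and none
 * proceeds until the reserve pool has recovered past its floor.
 */
static kmutex_t		ticket_lock;
static kcondvar_t	reserve_cv;

static void
noncritical_reserve_alloc(pgcnt_t npages)
{
	mutex_enter(&ticket_lock);
	while (reserve_freemem < reserve_minfree + npages)
		cv_wait(&reserve_cv, &ticket_lock);	/* let cageout catch up */
	take_from_reserve(npages);
	mutex_exit(&ticket_lock);
}
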
1935 1924
1936 1925 /*
1937 1926 * Create enough pages for "bytes" worth of data starting at
1938 1927 * "off" in "vp".
1939 1928 *
1940 1929 * Where flag must be one of:
1941 1930 *
1942 1931 * PG_EXCL: Exclusive create (fail if any page already
1943 1932 * exists in the page cache) which does not
1944 1933 * wait for memory to become available.
1945 1934 *
1946 1935 * PG_WAIT: Non-exclusive create which can wait for
1947 1936 * memory to become available.
1948 1937 *
1949 1938 * PG_PHYSCONTIG: Allocate physically contiguous pages.
1950 1939 * (Not Supported)
1951 1940 *
1952 1941 * A doubly linked list of pages is returned to the caller. Each page
1953 1942 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1954 1943 * lock.
1955 1944 *
1956 1945 * Unable to change the parameters to page_create() in a minor release,
1957 1946 * we renamed page_create() to page_create_va(), changed all known calls
1958 1947 * from page_create() to page_create_va(), and created this wrapper.
1959 1948 *
1960 1949 * Upon a major release, we should break compatibility by deleting this
1961 1950 * wrapper, and replacing all the strings "page_create_va", with "page_create".
1962 1951 *
1963 1952 * NOTE: There is a copy of this interface as page_create_io() in
1964 1953 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1965 1954 * there.
1966 1955 */
1967 1956 page_t *
1968 1957 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1969 1958 {
1970 1959 caddr_t random_vaddr;
1971 1960 struct seg kseg;
1972 1961
1973 1962 #ifdef DEBUG
1974 1963 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1975 1964 (void *)caller());
1976 1965 #endif
1977 1966
1978 1967 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1979 1968 (uintptr_t)(off >> PAGESHIFT));
1980 1969 kseg.s_as = &kas;
1981 1970
1982 1971 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1983 1972 }
1984 1973
1985 1974 #ifdef DEBUG
1986 1975 uint32_t pg_alloc_pgs_mtbf = 0;
1987 1976 #endif
1988 1977
1989 1978 /*
1990 1979 * Used for large page support. It will attempt to allocate
1991 1980 * a large page(s) off the freelist.
1992 1981 *
1993 1982	 * Returns non-zero on failure.
1994 1983 */
1995 1984 int
1996 1985 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1997 1986 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
1998 1987 {
1999 1988 pgcnt_t npgs, curnpgs, totpgs;
2000 1989 size_t pgsz;
2001 1990 page_t *pplist = NULL, *pp;
2002 1991 int err = 0;
2003 1992 lgrp_t *lgrp;
2004 1993
2005 1994 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
2006 1995 ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
2007 1996
2008 1997 /*
2009 1998 * Check if system heavily prefers local large pages over remote
2010 1999 * on systems with multiple lgroups.
2011 2000 */
2012 2001 if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2013 2002 pgflags = PG_LOCAL;
2014 2003 }
2015 2004
2016 2005 VM_STAT_ADD(alloc_pages[0]);
2017 2006
2018 2007 #ifdef DEBUG
2019 2008 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2020 2009 return (ENOMEM);
2021 2010 }
2022 2011 #endif
2023 2012
2024 2013 /*
2025 2014	 * Exactly one of basepp and ppa must be NULL,
2026 2015	 * and the other must be non-NULL.
2027 2016 */
2028 2017 ASSERT(basepp != NULL || ppa != NULL);
2029 2018 ASSERT(basepp == NULL || ppa == NULL);
2030 2019
2031 2020 #if defined(__i386) || defined(__amd64)
2032 2021 while (page_chk_freelist(szc) == 0) {
2033 2022 VM_STAT_ADD(alloc_pages[8]);
2034 2023 if (anypgsz == 0 || --szc == 0)
2035 2024 return (ENOMEM);
2036 2025 }
2037 2026 #endif
2038 2027
2039 2028 pgsz = page_get_pagesize(szc);
2040 2029 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2041 2030
2042 2031 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2043 2032
2044 2033 (void) page_create_wait(npgs, PG_WAIT);
2045 2034
2046 2035 while (npgs && szc) {
2047 2036 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2048 2037 if (pgflags == PG_LOCAL) {
2049 2038 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2050 2039 pgflags, lgrp);
2051 2040 if (pp == NULL) {
2052 2041 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2053 2042 0, lgrp);
2054 2043 }
2055 2044 } else {
2056 2045 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2057 2046 0, lgrp);
2058 2047 }
2059 2048 if (pp != NULL) {
2060 2049 VM_STAT_ADD(alloc_pages[1]);
2061 2050 page_list_concat(&pplist, &pp);
2062 2051 ASSERT(npgs >= curnpgs);
2063 2052 npgs -= curnpgs;
2064 2053 } else if (anypgsz) {
2065 2054 VM_STAT_ADD(alloc_pages[2]);
2066 2055 szc--;
2067 2056 pgsz = page_get_pagesize(szc);
2068 2057 curnpgs = pgsz >> PAGESHIFT;
2069 2058 } else {
2070 2059 VM_STAT_ADD(alloc_pages[3]);
2071 2060 ASSERT(npgs == totpgs);
2072 2061 page_create_putback(npgs);
2073 2062 return (ENOMEM);
2074 2063 }
2075 2064 }
2076 2065 if (szc == 0) {
2077 2066 VM_STAT_ADD(alloc_pages[4]);
2078 2067 ASSERT(npgs != 0);
2079 2068 page_create_putback(npgs);
2080 2069 err = ENOMEM;
2081 2070 } else if (basepp != NULL) {
2082 2071 ASSERT(npgs == 0);
2083 2072 ASSERT(ppa == NULL);
2084 2073 *basepp = pplist;
2085 2074 }
2086 2075
2087 2076 npgs = totpgs - npgs;
2088 2077 pp = pplist;
2089 2078
2090 2079 /*
2091 2080 * Clear the free and age bits. Also if we were passed in a ppa then
2092 2081 * fill it in with all the constituent pages from the large page. But
2093 2082 * if we failed to allocate all the pages just free what we got.
2094 2083 */
2095 2084 while (npgs != 0) {
2096 2085 ASSERT(PP_ISFREE(pp));
2097 2086 ASSERT(PP_ISAGED(pp));
2098 2087 if (ppa != NULL || err != 0) {
2099 2088 if (err == 0) {
2100 2089 VM_STAT_ADD(alloc_pages[5]);
2101 2090 PP_CLRFREE(pp);
2102 2091 PP_CLRAGED(pp);
2103 2092 page_sub(&pplist, pp);
2104 2093 *ppa++ = pp;
2105 2094 npgs--;
2106 2095 } else {
2107 2096 VM_STAT_ADD(alloc_pages[6]);
2108 2097 ASSERT(pp->p_szc != 0);
2109 2098 curnpgs = page_get_pagecnt(pp->p_szc);
2110 2099 page_list_break(&pp, &pplist, curnpgs);
2111 2100 page_list_add_pages(pp, 0);
2112 2101 page_create_putback(curnpgs);
2113 2102 ASSERT(npgs >= curnpgs);
2114 2103 npgs -= curnpgs;
2115 2104 }
2116 2105 pp = pplist;
2117 2106 } else {
2118 2107 VM_STAT_ADD(alloc_pages[7]);
2119 2108 PP_CLRFREE(pp);
2120 2109 PP_CLRAGED(pp);
2121 2110 pp = pp->p_next;
2122 2111 npgs--;
2123 2112 }
2124 2113 }
2125 2114 return (err);
2126 2115 }
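
A hedged caller-side sketch of the contract documented above: pass exactly one of basepp/ppa, set anypgsz if smaller page sizes are an acceptable fallback, and treat a non-zero return as ENOMEM. The surrounding caller logic is invented for illustration.

/* Hypothetical caller: one large page of size code "szc", or bust. */
page_t *base;

if (page_alloc_pages(vp, seg, addr, &base, NULL, szc,
    1 /* anypgsz: demotion to smaller sizes is OK */, 0) != 0) {
	/* ENOMEM: fall back to base pages via page_create_va(). */
	return (ENOMEM);
}
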
2127 2116
2128 2117 /*
2129 2118 * Get a single large page off of the freelists, and set it up for use.
2130 2119 * Number of bytes requested must be a supported page size.
2131 2120 *
2132 2121 * Note that this call may fail even if there is sufficient
2133 2122 * memory available or PG_WAIT is set, so the caller must
2134 2123	 * be willing to fall back on page_create_va(), block and retry,
2135 2124 * or fail the requester.
2136 2125 */
2137 2126 page_t *
2138 2127 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2139 2128 struct seg *seg, caddr_t vaddr, void *arg)
2140 2129 {
2141 2130 pgcnt_t npages;
2142 2131 page_t *pp;
2143 2132 page_t *rootpp;
2144 2133 lgrp_t *lgrp;
2145 2134 lgrp_id_t *lgrpid = (lgrp_id_t *)arg;
2146 2135
2147 2136 ASSERT(vp != NULL);
2148 2137
2149 2138 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2150 2139 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2151 2140 /* but no others */
2152 2141
2153 2142 ASSERT((flags & PG_EXCL) == PG_EXCL);
2154 2143
2155 2144 npages = btop(bytes);
2156 2145
2157 2146 if (!kcage_on || panicstr) {
2158 2147 /*
2159 2148 * Cage is OFF, or we are single threaded in
2160 2149 * panic, so make everything a RELOC request.
2161 2150 */
2162 2151 flags &= ~PG_NORELOC;
2163 2152 }
2164 2153
2165 2154 /*
2166 2155 * Make sure there's adequate physical memory available.
2167 2156 * Note: PG_WAIT is ignored here.
2168 2157 */
2169 2158 if (freemem <= throttlefree + npages) {
2170 2159 VM_STAT_ADD(page_create_large_cnt[1]);
2171 2160 return (NULL);
2172 2161 }
2173 2162
2174 2163 /*
2175 2164 * If cage is on, dampen draw from cage when available
2176 2165 * cage space is low.
2177 2166 */
2178 2167 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) &&
2179 2168 kcage_freemem < kcage_throttlefree + npages) {
2180 2169
2181 2170 /*
2182 2171 * The cage is on, the caller wants PG_NORELOC
2183 2172 * pages and available cage memory is very low.
2184 2173 * Call kcage_create_throttle() to attempt to
2185 2174 * control demand on the cage.
2186 2175 */
2187 2176 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2188 2177 VM_STAT_ADD(page_create_large_cnt[2]);
2189 2178 return (NULL);
2190 2179 }
2191 2180 }
2192 2181
2193 2182 if (!pcf_decrement_bucket(npages) &&
2194 2183 !pcf_decrement_multiple(NULL, npages, 1)) {
2195 2184 VM_STAT_ADD(page_create_large_cnt[4]);
2196 2185 return (NULL);
2197 2186 }
2198 2187
2199 2188 /*
2200 2189 * This is where this function behaves fundamentally differently
2201 2190 * than page_create_va(); since we're intending to map the page
2202 2191 * with a single TTE, we have to get it as a physically contiguous
2203 2192 * hardware pagesize chunk. If we can't, we fail.
2204 2193 */
2205 2194 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2206 2195 LGRP_EXISTS(lgrp_table[*lgrpid]))
2207 2196 lgrp = lgrp_table[*lgrpid];
2208 2197 else
2209 2198 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2210 2199
2211 2200 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2212 2201 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2213 2202 page_create_putback(npages);
2214 2203 VM_STAT_ADD(page_create_large_cnt[5]);
2215 2204 return (NULL);
2216 2205 }
2217 2206
2218 2207 /*
2219 2208	 * If we got the page with the wrong mtype, give it back.  This is a
2220 2209	 * workaround for CR 6249718.  When CR 6249718 is fixed we never get
2221 2210	 * inside the "if" and the workaround becomes just a nop.
2222 2211 */
2223 2212 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2224 2213 page_list_add_pages(rootpp, 0);
2225 2214 page_create_putback(npages);
2226 2215 VM_STAT_ADD(page_create_large_cnt[6]);
2227 2216 return (NULL);
2228 2217 }
2229 2218
2230 2219 /*
2231 2220 * If satisfying this request has left us with too little
2232 2221 * memory, start the wheels turning to get some back. The
2233 2222 * first clause of the test prevents waking up the pageout
2234 2223 * daemon in situations where it would decide that there's
2235 2224 * nothing to do.
2236 2225 */
2237 2226 if (nscan < desscan && freemem < minfree) {
2238 2227 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2239 2228 "pageout_cv_signal:freemem %ld", freemem);
2240 2229 cv_signal(&proc_pageout->p_cv);
2241 2230 }
2242 2231
2243 2232 pp = rootpp;
2244 2233 while (npages--) {
2245 2234 ASSERT(PAGE_EXCL(pp));
2246 2235 ASSERT(pp->p_vnode == NULL);
2247 2236 ASSERT(!hat_page_is_mapped(pp));
2248 2237 PP_CLRFREE(pp);
2249 2238 PP_CLRAGED(pp);
2250 2239 if (!page_hashin(pp, vp, off, NULL))
2251 2240 panic("page_create_large: hashin failed: page %p",
2252 2241 (void *)pp);
2253 2242 page_io_lock(pp);
2254 2243 off += PAGESIZE;
2255 2244 pp = pp->p_next;
2256 2245 }
2257 2246
2258 2247 VM_STAT_ADD(page_create_large_cnt[0]);
2259 2248 return (rootpp);
2260 2249 }
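
pcf_decrement_bucket() and pcf_decrement_multiple() are called above but defined earlier in the file, outside this hunk. From their uses here, a plausible sketch of the single-bucket fast path, renamed to make clear it is a reconstruction rather than the file's definition:

/*
 * Assumed contract: take npages from the caller's own bucket if it can
 * cover them outright; return non-zero on success, zero if the caller
 * must fall back to pcf_decrement_multiple().
 */
static int
pcf_decrement_bucket_sketch(pgcnt_t npages)
{
	struct pcf *p = &pcf[PCF_INDEX()];
	int taken = 0;

	mutex_enter(&p->pcf_lock);
	if (!p->pcf_block && p->pcf_count >= npages) {
		p->pcf_count -= (uint_t)npages;
		freemem -= npages;	/* approximate, as noted above */
		taken = 1;
	}
	mutex_exit(&p->pcf_lock);
	return (taken);
}
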
2261 2250
2262 2251 page_t *
2263 2252 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2264 2253 struct seg *seg, caddr_t vaddr)
2265 2254 {
2266 2255 page_t *plist = NULL;
2267 2256 pgcnt_t npages;
2268 2257 pgcnt_t found_on_free = 0;
2269 2258 pgcnt_t pages_req;
2270 2259 page_t *npp = NULL;
2271 2260 struct pcf *p;
2272 2261 lgrp_t *lgrp;
2273 2262
2274 2263 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2275 2264 "page_create_start:vp %p off %llx bytes %lu flags %x",
2276 2265 vp, off, bytes, flags);
2277 2266
2278 2267 ASSERT(bytes != 0 && vp != NULL);
2279 2268
2280 2269 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2281 2270 panic("page_create: invalid flags");
2282 2271 /*NOTREACHED*/
2283 2272 }
2284 2273 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2285 2274 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2286 2275 /* but no others */
2287 2276
2288 2277 pages_req = npages = btopr(bytes);
2289 2278 /*
2290 2279	 * Try to see whether the request is too large to *ever* be
2291 2280 * satisfied, in order to prevent deadlock. We arbitrarily
2292 2281 * decide to limit maximum size requests to max_page_get.
2293 2282 */
2294 2283 if (npages >= max_page_get) {
2295 2284 if ((flags & PG_WAIT) == 0) {
2296 2285 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2297 2286 "page_create_toobig:vp %p off %llx npages "
2298 2287 "%lu max_page_get %lu",
2299 2288 vp, off, npages, max_page_get);
2300 2289 return (NULL);
2301 2290 } else {
2302 2291 cmn_err(CE_WARN,
2303 2292 "Request for too much kernel memory "
2304 2293 "(%lu bytes), will hang forever", bytes);
2305 2294 for (;;)
2306 2295 delay(1000000000);
2307 2296 }
2308 2297 }
2309 2298
2310 2299 if (!kcage_on || panicstr) {
2311 2300 /*
2312 2301 * Cage is OFF, or we are single threaded in
2313 2302 * panic, so make everything a RELOC request.
2314 2303 */
2315 2304 flags &= ~PG_NORELOC;
2316 2305 }
2317 2306
2318 2307 if (freemem <= throttlefree + npages)
2319 2308 if (!page_create_throttle(npages, flags))
2320 2309 return (NULL);
2321 2310
2322 2311 /*
2323 2312 * If cage is on, dampen draw from cage when available
2324 2313 * cage space is low.
2325 2314 */
2326 2315 if ((flags & PG_NORELOC) &&
2327 2316 kcage_freemem < kcage_throttlefree + npages) {
2328 2317
2329 2318 /*
2330 2319 * The cage is on, the caller wants PG_NORELOC
2331 2320 * pages and available cage memory is very low.
2332 2321 * Call kcage_create_throttle() to attempt to
2333 2322 * control demand on the cage.
2334 2323 */
2335 2324 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2336 2325 return (NULL);
2337 2326 }
2338 2327
2339 2328 VM_STAT_ADD(page_create_cnt[0]);
2340 2329
2341 2330 if (!pcf_decrement_bucket(npages)) {
2342 2331 /*
2343 2332 * Have to look harder. If npages is greater than
2344 2333 * one, then we might have to coalesce the counters.
2345 2334 *
2346 2335 * Go wait. We come back having accounted
2347 2336 * for the memory.
2348 2337 */
2349 2338 VM_STAT_ADD(page_create_cnt[1]);
2350 2339 if (!page_create_wait(npages, flags)) {
2351 2340 VM_STAT_ADD(page_create_cnt[2]);
2352 2341 return (NULL);
2353 2342 }
2354 2343 }
2355 2344
2356 2345 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2357 2346 "page_create_success:vp %p off %llx", vp, off);
2358 2347
2359 2348 /*
2360 2349 * If satisfying this request has left us with too little
2361 2350 * memory, start the wheels turning to get some back. The
2362 2351 * first clause of the test prevents waking up the pageout
2363 2352 * daemon in situations where it would decide that there's
2364 2353 * nothing to do.
2365 2354 */
2366 2355 if (nscan < desscan && freemem < minfree) {
2367 2356 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2368 2357 "pageout_cv_signal:freemem %ld", freemem);
2369 2358 cv_signal(&proc_pageout->p_cv);
2370 2359 }
2371 2360
2372 2361 /*
2373 2362 * Loop around collecting the requested number of pages.
2374 2363 * Most of the time, we have to `create' a new page. With
2375 2364 * this in mind, pull the page off the free list before
2376 2365 * getting the hash lock. This will minimize the hash
2377 2366 * lock hold time, nesting, and the like. If it turns
2378 2367 * out we don't need the page, we put it back at the end.
2379 2368 */
2380 2369 while (npages--) {
2381 2370 page_t *pp;
2382 2371 kmutex_t *phm = NULL;
2383 2372 ulong_t index;
2384 2373
2385 2374 index = PAGE_HASH_FUNC(vp, off);
2386 2375 top:
2387 2376 ASSERT(phm == NULL);
2388 2377 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2389 2378 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2390 2379
2391 2380 if (npp == NULL) {
2392 2381 /*
2393 2382 * Try to get a page from the freelist (ie,
2394 2383 * a page with no [vp, off] tag). If that
2395 2384 * fails, use the cachelist.
2396 2385 *
2397 2386 * During the first attempt at both the free
2398 2387 * and cache lists we try for the correct color.
2399 2388 */
2400 2389 /*
2401 2390	 * XXXX - how do we deal with virtually indexed
2402 2391	 * caches and colors?
2403 2392 */
2404 2393 VM_STAT_ADD(page_create_cnt[4]);
2405 2394 /*
2406 2395 * Get lgroup to allocate next page of shared memory
2407 2396 * from and use it to specify where to allocate
2408 2397 * the physical memory
2409 2398 */
2410 2399 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2411 2400 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2412 2401 flags | PG_MATCH_COLOR, lgrp);
2413 2402 if (npp == NULL) {
2414 2403 npp = page_get_cachelist(vp, off, seg,
2415 2404 vaddr, flags | PG_MATCH_COLOR, lgrp);
2416 2405 if (npp == NULL) {
2417 2406 npp = page_create_get_something(vp,
2418 2407 off, seg, vaddr,
2419 2408 flags & ~PG_MATCH_COLOR);
2420 2409 }
2421 2410
2422 2411 if (PP_ISAGED(npp) == 0) {
2423 2412 /*
2424 2413 * Since this page came from the
2425 2414 * cachelist, we must destroy the
2426 2415 * old vnode association.
2427 2416 */
2428 2417 page_hashout(npp, NULL);
2429 2418 }
2430 2419 }
2431 2420 }
2432 2421
2433 2422 /*
2434 2423 * We own this page!
2435 2424 */
2436 2425 ASSERT(PAGE_EXCL(npp));
2437 2426 ASSERT(npp->p_vnode == NULL);
2438 2427 ASSERT(!hat_page_is_mapped(npp));
2439 2428 PP_CLRFREE(npp);
2440 2429 PP_CLRAGED(npp);
2441 2430
2442 2431 /*
2444 2433	 * Here we have a page in our hot little mitts and are
2444 2433 * just waiting to stuff it on the appropriate lists.
2445 2434 * Get the mutex and check to see if it really does
2446 2435 * not exist.
2447 2436 */
2448 2437 phm = PAGE_HASH_MUTEX(index);
2449 2438 mutex_enter(phm);
2450 - PAGE_HASH_SEARCH(index, pp, vp, off);
2439 + pp = page_hash_search(index, vp, off);
2451 2440 if (pp == NULL) {
2452 2441 VM_STAT_ADD(page_create_new);
2453 2442 pp = npp;
2454 2443 npp = NULL;
2455 2444 if (!page_hashin(pp, vp, off, phm)) {
2456 2445 /*
2457 2446 * Since we hold the page hash mutex and
2458 2447 * just searched for this page, page_hashin
2459 2448 * had better not fail. If it does, that
2460 2449	 * means some thread did not follow the
2461 2450 * page hash mutex rules. Panic now and
2462 2451 * get it over with. As usual, go down
2463 2452 * holding all the locks.
2464 2453 */
2465 2454 ASSERT(MUTEX_HELD(phm));
2466 2455 panic("page_create: "
2467 2456 "hashin failed %p %p %llx %p",
2468 2457 (void *)pp, (void *)vp, off, (void *)phm);
2469 2458 /*NOTREACHED*/
2470 2459 }
2471 2460 ASSERT(MUTEX_HELD(phm));
2472 2461 mutex_exit(phm);
2473 2462 phm = NULL;
2474 2463
2475 2464 /*
2476 2465 * Hat layer locking need not be done to set
2477 2466 * the following bits since the page is not hashed
2478 2467 * and was on the free list (i.e., had no mappings).
2479 2468 *
2480 2469 * Set the reference bit to protect
2481 2470 * against immediate pageout
2482 2471 *
2483 2472 * XXXmh modify freelist code to set reference
2484 2473 * bit so we don't have to do it here.
2485 2474 */
2486 2475 page_set_props(pp, P_REF);
2487 2476 found_on_free++;
2488 2477 } else {
2489 2478 VM_STAT_ADD(page_create_exists);
2490 2479 if (flags & PG_EXCL) {
2491 2480 /*
2492 2481 * Found an existing page, and the caller
2493 2482 * wanted all new pages. Undo all of the work
2494 2483 * we have done.
2495 2484 */
2496 2485 mutex_exit(phm);
2497 2486 phm = NULL;
2498 2487 while (plist != NULL) {
2499 2488 pp = plist;
2500 2489 page_sub(&plist, pp);
2501 2490 page_io_unlock(pp);
2502 2491 /* large pages should not end up here */
2503 2492 ASSERT(pp->p_szc == 0);
2504 2493 /*LINTED: constant in conditional ctx*/
2505 2494 VN_DISPOSE(pp, B_INVAL, 0, kcred);
2506 2495 }
2507 2496 VM_STAT_ADD(page_create_found_one);
2508 2497 goto fail;
2509 2498 }
2510 2499 ASSERT(flags & PG_WAIT);
2511 2500 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2512 2501 /*
2513 2502 * Start all over again if we blocked trying
2514 2503 * to lock the page.
2515 2504 */
2516 2505 mutex_exit(phm);
2517 2506 VM_STAT_ADD(page_create_page_lock_failed);
2518 2507 phm = NULL;
2519 2508 goto top;
2520 2509 }
2521 2510 mutex_exit(phm);
2522 2511 phm = NULL;
2523 2512
2524 2513 if (PP_ISFREE(pp)) {
2525 2514 ASSERT(PP_ISAGED(pp) == 0);
2526 2515 VM_STAT_ADD(pagecnt.pc_get_cache);
2527 2516 page_list_sub(pp, PG_CACHE_LIST);
2528 2517 PP_CLRFREE(pp);
2529 2518 found_on_free++;
2530 2519 }
2531 2520 }
2532 2521
2533 2522 /*
2534 2523 * Got a page! It is locked. Acquire the i/o
2535 2524 * lock since we are going to use the p_next and
2536 2525 * p_prev fields to link the requested pages together.
2537 2526 */
2538 2527 page_io_lock(pp);
2539 2528 page_add(&plist, pp);
2540 2529 plist = plist->p_next;
2541 2530 off += PAGESIZE;
2542 2531 vaddr += PAGESIZE;
2543 2532 }
2544 2533
2545 2534 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2546 2535 fail:
2547 2536 if (npp != NULL) {
2548 2537 /*
2549 2538 * Did not need this page after all.
2550 2539 * Put it back on the free list.
2551 2540 */
2552 2541 VM_STAT_ADD(page_create_putbacks);
2553 2542 PP_SETFREE(npp);
2554 2543 PP_SETAGED(npp);
2555 2544 npp->p_offset = (u_offset_t)-1;
2556 2545 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2557 2546 page_unlock(npp);
2558 2547
2559 2548 }
2560 2549
2561 2550 ASSERT(pages_req >= found_on_free);
2562 2551
2563 2552 {
2564 2553 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2565 2554
2566 2555 if (overshoot) {
2567 2556 VM_STAT_ADD(page_create_overshoot);
2568 2557 p = &pcf[PCF_INDEX()];
2569 2558 mutex_enter(&p->pcf_lock);
2570 2559 if (p->pcf_block) {
2571 2560 p->pcf_reserve += overshoot;
2572 2561 } else {
2573 2562 p->pcf_count += overshoot;
2574 2563 if (p->pcf_wait) {
2575 2564 mutex_enter(&new_freemem_lock);
2576 2565 if (freemem_wait) {
2577 2566 cv_signal(&freemem_cv);
2578 2567 p->pcf_wait--;
2579 2568 } else {
2580 2569 p->pcf_wait = 0;
2581 2570 }
2582 2571 mutex_exit(&new_freemem_lock);
2583 2572 }
2584 2573 }
2585 2574 mutex_exit(&p->pcf_lock);
2586 2575 /* freemem is approximate, so this test OK */
2587 2576 if (!p->pcf_block)
2588 2577 freemem += overshoot;
2589 2578 }
2590 2579 }
2591 2580
2592 2581 return (plist);
2593 2582 }
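
The hunk above is one of the two call sites this change converts from the old PAGE_HASH_SEARCH() macro to the new page_hash_search() function. Reconstructed from the chain walk visible in page_do_hashin() below (page_hash[index] linked through p_hash, keyed on p_vnode/p_offset), the helper plausibly looks like the sketch below; its real definition lives elsewhere in this patch and may differ, e.g. by keeping search-length statistics.

static inline page_t *
page_hash_search(ulong_t index, vnode_t *vp, u_offset_t off)
{
	page_t *pp;

	/* Walk the hash chain for this bucket, matching on identity. */
	for (pp = page_hash[index]; pp != NULL; pp = pp->p_hash)
		if (pp->p_vnode == vp && pp->p_offset == off)
			break;

	return (pp);
}

Unlike the macro, the function is type-checked, evaluates its arguments once, and returns its result rather than assigning through a hidden out-parameter, which is why the converted call sites read as plain assignments.
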
2594 2583
2595 2584 /*
2596 2585 * One or more constituent pages of this large page has been marked
2597 2586 * toxic. Simply demote the large page to PAGESIZE pages and let
2598 2587 * page_free() handle it. This routine should only be called by
2599 2588	 * large page free routines (page_free_pages() and page_destroy_pages()).
2600 2589 * All pages are locked SE_EXCL and have already been marked free.
2601 2590 */
2602 2591 static void
2603 2592 page_free_toxic_pages(page_t *rootpp)
2604 2593 {
2605 2594 page_t *tpp;
2606 2595 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2607 2596 uint_t szc = rootpp->p_szc;
2608 2597
2609 2598 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2610 2599 ASSERT(tpp->p_szc == szc);
2611 2600 ASSERT((PAGE_EXCL(tpp) &&
2612 2601 !page_iolock_assert(tpp)) || panicstr);
2613 2602 tpp->p_szc = 0;
2614 2603 }
2615 2604
2616 2605 while (rootpp != NULL) {
2617 2606 tpp = rootpp;
2618 2607 page_sub(&rootpp, tpp);
2619 2608 ASSERT(PP_ISFREE(tpp));
2620 2609 PP_CLRFREE(tpp);
2621 2610 page_free(tpp, 1);
2622 2611 }
2623 2612 }
2624 2613
2625 2614 /*
2626 2615 * Put page on the "free" list.
2627 2616 * The free list is really two lists maintained by
2628 2617 * the PSM of whatever machine we happen to be on.
2629 2618 */
2630 2619 void
2631 2620 page_free(page_t *pp, int dontneed)
2632 2621 {
2633 2622 struct pcf *p;
2634 2623 uint_t pcf_index;
2635 2624
2636 2625 ASSERT((PAGE_EXCL(pp) &&
2637 2626 !page_iolock_assert(pp)) || panicstr);
2638 2627
2639 2628 if (PP_ISFREE(pp)) {
2640 2629 panic("page_free: page %p is free", (void *)pp);
2641 2630 }
2642 2631
2643 2632 if (pp->p_szc != 0) {
2644 2633 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2645 2634 PP_ISKAS(pp)) {
2646 2635 panic("page_free: anon or kernel "
2647 2636 "or no vnode large page %p", (void *)pp);
2648 2637 }
2649 2638 page_demote_vp_pages(pp);
2650 2639 ASSERT(pp->p_szc == 0);
2651 2640 }
2652 2641
2653 2642 /*
2654 2643 * The page_struct_lock need not be acquired to examine these
2655 2644 * fields since the page has an "exclusive" lock.
2656 2645 */
2657 2646 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2658 2647 pp->p_slckcnt != 0) {
2659 2648 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2660 2649 "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2661 2650 pp->p_cowcnt, pp->p_slckcnt);
2662 2651 /*NOTREACHED*/
2663 2652 }
2664 2653
2665 2654 ASSERT(!hat_page_getshare(pp));
2666 2655
2667 2656 PP_SETFREE(pp);
2668 2657 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2669 2658 !hat_ismod(pp));
2670 2659 page_clr_all_props(pp);
2671 2660 ASSERT(!hat_page_getshare(pp));
2672 2661
2673 2662 /*
2674 2663 * Now we add the page to the head of the free list.
2675 2664 * But if this page is associated with a paged vnode
2676 2665 * then we adjust the head forward so that the page is
2677 2666 * effectively at the end of the list.
2678 2667 */
2679 2668 if (pp->p_vnode == NULL) {
2680 2669 /*
2681 2670 * Page has no identity, put it on the free list.
2682 2671 */
2683 2672 PP_SETAGED(pp);
2684 2673 pp->p_offset = (u_offset_t)-1;
2685 2674 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2686 2675 VM_STAT_ADD(pagecnt.pc_free_free);
2687 2676 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2688 2677 "page_free_free:pp %p", pp);
2689 2678 } else {
2690 2679 PP_CLRAGED(pp);
2691 2680
2692 2681 if (!dontneed) {
2693 2682 /* move it to the tail of the list */
2694 2683 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2695 2684
2696 2685 VM_STAT_ADD(pagecnt.pc_free_cache);
2697 2686 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2698 2687 "page_free_cache_tail:pp %p", pp);
2699 2688 } else {
2700 2689 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2701 2690
2702 2691 VM_STAT_ADD(pagecnt.pc_free_dontneed);
2703 2692 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2704 2693 "page_free_cache_head:pp %p", pp);
2705 2694 }
2706 2695 }
2707 2696 page_unlock(pp);
2708 2697
2709 2698 /*
2710 2699 * Now do the `freemem' accounting.
2711 2700 */
2712 2701 pcf_index = PCF_INDEX();
2713 2702 p = &pcf[pcf_index];
2714 2703
2715 2704 mutex_enter(&p->pcf_lock);
2716 2705 if (p->pcf_block) {
2717 2706 p->pcf_reserve += 1;
2718 2707 } else {
2719 2708 p->pcf_count += 1;
2720 2709 if (p->pcf_wait) {
2721 2710 mutex_enter(&new_freemem_lock);
2722 2711 /*
2723 2712 * Check to see if some other thread
2724 2713 * is actually waiting. Another bucket
2725 2714 * may have woken it up by now. If there
2726 2715 * are no waiters, then set our pcf_wait
2727 2716 * count to zero to avoid coming in here
2728 2717 * next time. Also, since only one page
2729 2718 * was put on the free list, just wake
2730 2719 * up one waiter.
2731 2720 */
2732 2721 if (freemem_wait) {
2733 2722 cv_signal(&freemem_cv);
2734 2723 p->pcf_wait--;
2735 2724 } else {
2736 2725 p->pcf_wait = 0;
2737 2726 }
2738 2727 mutex_exit(&new_freemem_lock);
2739 2728 }
2740 2729 }
2741 2730 mutex_exit(&p->pcf_lock);
2742 2731
2743 2732 /* freemem is approximate, so this test OK */
2744 2733 if (!p->pcf_block)
2745 2734 freemem += 1;
2746 2735 }
2747 2736
2748 2737 /*
2749 2738	 * Put page on the "free" list during initial startup.
2750 2739 * This happens during initial single threaded execution.
2751 2740 */
2752 2741 void
2753 2742 page_free_at_startup(page_t *pp)
2754 2743 {
2755 2744 struct pcf *p;
2756 2745 uint_t pcf_index;
2757 2746
2758 2747 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2759 2748 VM_STAT_ADD(pagecnt.pc_free_free);
2760 2749
2761 2750 /*
2762 2751 * Now do the `freemem' accounting.
2763 2752 */
2764 2753 pcf_index = PCF_INDEX();
2765 2754 p = &pcf[pcf_index];
2766 2755
2767 2756 ASSERT(p->pcf_block == 0);
2768 2757 ASSERT(p->pcf_wait == 0);
2769 2758 p->pcf_count += 1;
2770 2759
2771 2760 /* freemem is approximate, so this is OK */
2772 2761 freemem += 1;
2773 2762 }
2774 2763
2775 2764 void
2776 2765 page_free_pages(page_t *pp)
2777 2766 {
2778 2767 page_t *tpp, *rootpp = NULL;
2779 2768 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2780 2769 pgcnt_t i;
2781 2770 uint_t szc = pp->p_szc;
2782 2771
2783 2772 VM_STAT_ADD(pagecnt.pc_free_pages);
2784 2773 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2785 2774 "page_free_free:pp %p", pp);
2786 2775
2787 2776 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2788 2777 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2789 2778 panic("page_free_pages: not root page %p", (void *)pp);
2790 2779 /*NOTREACHED*/
2791 2780 }
2792 2781
2793 2782 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2794 2783 ASSERT((PAGE_EXCL(tpp) &&
2795 2784 !page_iolock_assert(tpp)) || panicstr);
2796 2785 if (PP_ISFREE(tpp)) {
2797 2786 panic("page_free_pages: page %p is free", (void *)tpp);
2798 2787 /*NOTREACHED*/
2799 2788 }
2800 2789 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2801 2790 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2802 2791 panic("page_free_pages %p", (void *)tpp);
2803 2792 /*NOTREACHED*/
2804 2793 }
2805 2794
2806 2795 ASSERT(!hat_page_getshare(tpp));
2807 2796 ASSERT(tpp->p_vnode == NULL);
2808 2797 ASSERT(tpp->p_szc == szc);
2809 2798
2810 2799 PP_SETFREE(tpp);
2811 2800 page_clr_all_props(tpp);
2812 2801 PP_SETAGED(tpp);
2813 2802 tpp->p_offset = (u_offset_t)-1;
2814 2803 ASSERT(tpp->p_next == tpp);
2815 2804 ASSERT(tpp->p_prev == tpp);
2816 2805 page_list_concat(&rootpp, &tpp);
2817 2806 }
2818 2807 ASSERT(rootpp == pp);
2819 2808
2820 2809 page_list_add_pages(rootpp, 0);
2821 2810 page_create_putback(pgcnt);
2822 2811 }
2823 2812
2824 2813 int free_pages = 1;
2825 2814
2826 2815 /*
2827 2816 * This routine attempts to return pages to the cachelist via page_release().
2828 2817 * It does not *have* to be successful in all cases, since the pageout scanner
2829 2818 * will catch any pages it misses. It does need to be fast and not introduce
2830 2819 * too much overhead.
2831 2820 *
2832 2821 * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2833 2822 * don't lock and retry. This is ok, since the page scanner will eventually
2834 2823 * find any page we miss in free_vp_pages().
2835 2824 */
2836 2825 void
2837 2826 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2838 2827 {
2839 2828 page_t *pp;
2840 2829 u_offset_t eoff;
2841 2830 extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2842 2831
2843 2832 eoff = off + len;
2844 2833
2845 2834 if (free_pages == 0)
2846 2835 return;
2847 2836 if (swap_in_range(vp, off, len))
2848 2837 return;
2849 2838
2850 2839 for (; off < eoff; off += PAGESIZE) {
2851 2840
2852 2841 /*
2853 2842 * find the page using a fast, but inexact search. It'll be OK
2854 2843 * if a few pages slip through the cracks here.
2855 2844 */
2856 2845 pp = page_exists(vp, off);
2857 2846
2858 2847 /*
2859 2848	 * If we didn't find the page (it may not exist), if the page
2860 2849	 * is free or still looks in use (shared), or if we can't lock
2861 2850	 * it, just give up.
2862 2851 */
2863 2852 if (pp == NULL ||
2864 2853 PP_ISFREE(pp) ||
2865 2854 page_share_cnt(pp) > 0 ||
2866 2855 !page_trylock(pp, SE_EXCL))
2867 2856 continue;
2868 2857
2869 2858 /*
2870 2859 * Once we have locked pp, verify that it's still the
2871 2860 * correct page and not already free
2872 2861 */
2873 2862 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2874 2863 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2875 2864 page_unlock(pp);
2876 2865 continue;
2877 2866 }
2878 2867
2879 2868 /*
2880 2869 * try to release the page...
2881 2870 */
2882 2871 (void) page_release(pp, 1);
2883 2872 }
2884 2873 }
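
A hedged picture of how a free-behind style caller might use free_vp_pages() once a strictly sequential reader has moved past a window; the detection helper is invented for illustration.

/* Hypothetical free-behind: drop the window we just finished reading. */
if (sequential_access_detected(vp, off))	/* invented helper */
	free_vp_pages(vp, off - len, len);
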
2885 2874
2886 2875 /*
2887 2876 * Reclaim the given page from the free list.
2888 2877	 * If pp is part of a large page, only the given constituent page is reclaimed
2889 2878 * and the large page it belonged to will be demoted. This can only happen
2890 2879 * if the page is not on the cachelist.
2891 2880 *
2892 2881 * Returns 1 on success or 0 on failure.
2893 2882 *
2894 2883 * The page is unlocked if it can't be reclaimed (when freemem == 0).
2895 2884 * If `lock' is non-null, it will be dropped and re-acquired if
2896 2885 * the routine must wait while freemem is 0.
2897 2886 *
2898 2887 * As it turns out, boot_getpages() does this. It picks a page,
2899 2888 * based on where OBP mapped in some address, gets its pfn, searches
2900 2889 * the memsegs, locks the page, then pulls it off the free list!
2901 2890 */
2902 2891 int
2903 2892 page_reclaim(page_t *pp, kmutex_t *lock)
2904 2893 {
2905 2894 struct pcf *p;
2906 2895 struct cpu *cpup;
2907 2896 int enough;
2908 2897 uint_t i;
2909 2898
2910 2899 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2911 2900 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2912 2901
2913 2902 /*
2914 2903 * If `freemem' is 0, we cannot reclaim this page from the
2915 2904 * freelist, so release every lock we might hold: the page,
2916 2905 * and the `lock' before blocking.
2917 2906 *
2918 2907 * The only way `freemem' can become 0 while there are pages
2919 2908 * marked free (have their p->p_free bit set) is when the
2920 2909 * system is low on memory and doing a page_create(). In
2921 2910 * order to guarantee that once page_create() starts acquiring
2922 2911 * pages it will be able to get all that it needs since `freemem'
2923 2912 * was decreased by the requested amount. So, we need to release
2924 2913 * this page, and let page_create() have it.
2925 2914 *
2926 2915 * Since `freemem' being zero is not supposed to happen, just
2927 2916 * use the usual hash stuff as a starting point. If that bucket
2928 2917 * is empty, then assume the worst, and start at the beginning
2929 2918 * of the pcf array. If we always start at the beginning
2930 2919 * when acquiring more than one pcf lock, there won't be any
2931 2920 * deadlock problems.
2932 2921 */
2933 2922
2934 2923 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2935 2924
2936 2925 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2937 2926 pcf_acquire_all();
2938 2927 goto page_reclaim_nomem;
2939 2928 }
2940 2929
2941 2930 enough = pcf_decrement_bucket(1);
2942 2931
2943 2932 if (!enough) {
2944 2933 VM_STAT_ADD(page_reclaim_zero);
2945 2934 /*
2946 2935	 * Check again. It's possible that some other thread
2947 2936 * could have been right behind us, and added one
2948 2937 * to a list somewhere. Acquire each of the pcf locks
2949 2938 * until we find a page.
2950 2939 */
2951 2940 p = pcf;
2952 2941 for (i = 0; i < pcf_fanout; i++) {
2953 2942 mutex_enter(&p->pcf_lock);
2954 2943 if (p->pcf_count >= 1) {
2955 2944 p->pcf_count -= 1;
2956 2945 /*
2957 2946 * freemem is not protected by any lock. Thus,
2958 2947 * we cannot have any assertion containing
2959 2948 * freemem here.
2960 2949 */
2961 2950 freemem -= 1;
2962 2951 enough = 1;
2963 2952 break;
2964 2953 }
2965 2954 p++;
2966 2955 }
2967 2956
2968 2957 if (!enough) {
2969 2958 page_reclaim_nomem:
2970 2959 /*
2971 2960 * We really can't have page `pp'.
2972 2961 * Time for the no-memory dance with
2973 2962 * page_free(). This is just like
2974 2963 * page_create_wait(). Plus the added
2975 2964 * attraction of releasing whatever mutex
2976 2965	 * we held when we were called, via `lock'.
2977 2966	 * Page_unlock() will wake up any thread
2978 2967 * waiting around for this page.
2979 2968 */
2980 2969 if (lock) {
2981 2970 VM_STAT_ADD(page_reclaim_zero_locked);
2982 2971 mutex_exit(lock);
2983 2972 }
2984 2973 page_unlock(pp);
2985 2974
2986 2975 /*
2987 2976 * get this before we drop all the pcf locks.
2988 2977 */
2989 2978 mutex_enter(&new_freemem_lock);
2990 2979
2991 2980 p = pcf;
2992 2981 for (i = 0; i < pcf_fanout; i++) {
2993 2982 p->pcf_wait++;
2994 2983 mutex_exit(&p->pcf_lock);
2995 2984 p++;
2996 2985 }
2997 2986
2998 2987 freemem_wait++;
2999 2988 cv_wait(&freemem_cv, &new_freemem_lock);
3000 2989 freemem_wait--;
3001 2990
3002 2991 mutex_exit(&new_freemem_lock);
3003 2992
3004 2993 if (lock) {
3005 2994 mutex_enter(lock);
3006 2995 }
3007 2996 return (0);
3008 2997 }
3009 2998
3010 2999 /*
3011 3000 * The pcf accounting has been done,
3012 3001 * though none of the pcf_wait flags have been set,
3013 3002 * drop the locks and continue on.
3014 3003 */
3015 3004 while (p >= pcf) {
3016 3005 mutex_exit(&p->pcf_lock);
3017 3006 p--;
3018 3007 }
3019 3008 }
3020 3009
3021 3010
3022 3011 VM_STAT_ADD(pagecnt.pc_reclaim);
3023 3012
3024 3013 /*
3025 3014 * page_list_sub will handle the case where pp is a large page.
3026 3015 * It's possible that the page was promoted while on the freelist
3027 3016 */
3028 3017 if (PP_ISAGED(pp)) {
3029 3018 page_list_sub(pp, PG_FREE_LIST);
3030 3019 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3031 3020 "page_reclaim_free:pp %p", pp);
3032 3021 } else {
3033 3022 page_list_sub(pp, PG_CACHE_LIST);
3034 3023 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3035 3024 "page_reclaim_cache:pp %p", pp);
3036 3025 }
3037 3026
3038 3027 /*
3039 3028 * clear the p_free & p_age bits since this page is no longer
3040 3029	 * on the free list. Notice that there is a brief time when
3041 3030	 * a page is marked as free but is not on the list.
3042 3031 *
3043 3032 * Set the reference bit to protect against immediate pageout.
3044 3033 */
3045 3034 PP_CLRFREE(pp);
3046 3035 PP_CLRAGED(pp);
3047 3036 page_set_props(pp, P_REF);
3048 3037
3049 3038 CPU_STATS_ENTER_K();
3050 3039 cpup = CPU; /* get cpup now that CPU cannot change */
3051 3040 CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3052 3041 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3053 3042 CPU_STATS_EXIT_K();
3054 3043 ASSERT(pp->p_szc == 0);
3055 3044
3056 3045 return (1);
3057 3046 }
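
Sketch of the reclaim pattern the comments above imply: with the page EXCL-locked and still marked free, pull it off the freelist; a zero return means page_reclaim() slept, unlocked the page, and (after re-taking phm) the caller should start over. The control flow around the call is illustrative only.

/* Illustrative caller; "retry" is a label in the invented caller. */
if (PP_ISFREE(pp)) {
	if (!page_reclaim(pp, phm))
		goto retry;	/* pp was unlocked for us; phm re-held */
}
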
3058 3047
3059 3048 /*
3060 3049 * Destroy identity of the page and put it back on
3061 3050 * the page free list. Assumes that the caller has
3062 3051 * acquired the "exclusive" lock on the page.
3063 3052 */
3064 3053 void
3065 3054 page_destroy(page_t *pp, int dontfree)
3066 3055 {
3067 3056 ASSERT((PAGE_EXCL(pp) &&
3068 3057 !page_iolock_assert(pp)) || panicstr);
3069 3058 ASSERT(pp->p_slckcnt == 0 || panicstr);
3070 3059
3071 3060 if (pp->p_szc != 0) {
3072 3061 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3073 3062 PP_ISKAS(pp)) {
3074 3063 panic("page_destroy: anon or kernel or no vnode "
3075 3064 "large page %p", (void *)pp);
3076 3065 }
3077 3066 page_demote_vp_pages(pp);
3078 3067 ASSERT(pp->p_szc == 0);
3079 3068 }
3080 3069
3081 3070 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3082 3071
3083 3072 /*
3084 3073 * Unload translations, if any, then hash out the
3085 3074 * page to erase its identity.
3086 3075 */
3087 3076 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3088 3077 page_hashout(pp, NULL);
3089 3078
3090 3079 if (!dontfree) {
3091 3080 /*
3092 3081 * Acquire the "freemem_lock" for availrmem.
3093 3082 * The page_struct_lock need not be acquired for lckcnt
3094 3083 * and cowcnt since the page has an "exclusive" lock.
3095 3084 * We are doing a modified version of page_pp_unlock here.
3096 3085 */
3097 3086 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3098 3087 mutex_enter(&freemem_lock);
3099 3088 if (pp->p_lckcnt != 0) {
3100 3089 availrmem++;
3101 3090 pages_locked--;
3102 3091 pp->p_lckcnt = 0;
3103 3092 }
3104 3093 if (pp->p_cowcnt != 0) {
3105 3094 availrmem += pp->p_cowcnt;
3106 3095 pages_locked -= pp->p_cowcnt;
3107 3096 pp->p_cowcnt = 0;
3108 3097 }
3109 3098 mutex_exit(&freemem_lock);
3110 3099 }
3111 3100 /*
3112 3101 * Put the page on the "free" list.
3113 3102 */
3114 3103 page_free(pp, 0);
3115 3104 }
3116 3105 }
3117 3106
3118 3107 void
3119 3108 page_destroy_pages(page_t *pp)
3120 3109 {
3121 3110
3122 3111 page_t *tpp, *rootpp = NULL;
3123 3112 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3124 3113 pgcnt_t i, pglcks = 0;
3125 3114 uint_t szc = pp->p_szc;
3126 3115
3127 3116 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3128 3117
3129 3118 VM_STAT_ADD(pagecnt.pc_destroy_pages);
3130 3119
3131 3120 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3132 3121
3133 3122 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3134 3123 panic("page_destroy_pages: not root page %p", (void *)pp);
3135 3124 /*NOTREACHED*/
3136 3125 }
3137 3126
3138 3127 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3139 3128 ASSERT((PAGE_EXCL(tpp) &&
3140 3129 !page_iolock_assert(tpp)) || panicstr);
3141 3130 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3142 3131 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3143 3132 page_hashout(tpp, NULL);
3144 3133 ASSERT(tpp->p_offset == (u_offset_t)-1);
3145 3134 if (tpp->p_lckcnt != 0) {
3146 3135 pglcks++;
3147 3136 tpp->p_lckcnt = 0;
3148 3137 } else if (tpp->p_cowcnt != 0) {
3149 3138 pglcks += tpp->p_cowcnt;
3150 3139 tpp->p_cowcnt = 0;
3151 3140 }
3152 3141 ASSERT(!hat_page_getshare(tpp));
3153 3142 ASSERT(tpp->p_vnode == NULL);
3154 3143 ASSERT(tpp->p_szc == szc);
3155 3144
3156 3145 PP_SETFREE(tpp);
3157 3146 page_clr_all_props(tpp);
3158 3147 PP_SETAGED(tpp);
3159 3148 ASSERT(tpp->p_next == tpp);
3160 3149 ASSERT(tpp->p_prev == tpp);
3161 3150 page_list_concat(&rootpp, &tpp);
3162 3151 }
3163 3152
3164 3153 ASSERT(rootpp == pp);
3165 3154 if (pglcks != 0) {
3166 3155 mutex_enter(&freemem_lock);
3167 3156 availrmem += pglcks;
3168 3157 mutex_exit(&freemem_lock);
3169 3158 }
3170 3159
3171 3160 page_list_add_pages(rootpp, 0);
3172 3161 page_create_putback(pgcnt);
3173 3162 }
3174 3163
3175 3164 /*
3176 3165 * Similar to page_destroy(), but destroys pages which are
3177 3166 * locked and known to be on the page free list. Since
3178 3167 * the page is known to be free and locked, no one can access
3179 3168 * it.
3180 3169 *
3181 3170 * Also, the number of free pages does not change.
3182 3171 */
3183 3172 void
3184 3173 page_destroy_free(page_t *pp)
3185 3174 {
3186 3175 ASSERT(PAGE_EXCL(pp));
3187 3176 ASSERT(PP_ISFREE(pp));
3188 3177 ASSERT(pp->p_vnode);
3189 3178 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3190 3179 ASSERT(!hat_page_is_mapped(pp));
3191 3180 ASSERT(PP_ISAGED(pp) == 0);
3192 3181 ASSERT(pp->p_szc == 0);
3193 3182
3194 3183 VM_STAT_ADD(pagecnt.pc_destroy_free);
3195 3184 page_list_sub(pp, PG_CACHE_LIST);
3196 3185
3197 3186 page_hashout(pp, NULL);
3198 3187 ASSERT(pp->p_vnode == NULL);
3199 3188 ASSERT(pp->p_offset == (u_offset_t)-1);
3200 3189 ASSERT(pp->p_hash == NULL);
3201 3190
3202 3191 PP_SETAGED(pp);
3203 3192 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3204 3193 page_unlock(pp);
3205 3194
3206 3195 mutex_enter(&new_freemem_lock);
3207 3196 if (freemem_wait) {
3208 3197 cv_signal(&freemem_cv);
3209 3198 }
3210 3199 mutex_exit(&new_freemem_lock);
3211 3200 }
3212 3201
3213 3202 /*
3214 3203 * Rename the page "opp" to have an identity specified
3215 3204 * by [vp, off]. If a page already exists with this name
3216 3205 * it is locked and destroyed. Note that the page's
3217 3206 * translations are not unloaded during the rename.
3218 3207 *
3219 3208 * This routine is used by the anon layer to "steal" the
3220 3209 * original page and is not unlike destroying a page and
3221 3210 * creating a new page using the same page frame.
3222 3211 *
3223 3212 * XXX -- Could deadlock if caller 1 tries to rename A to B while
3224 3213 * caller 2 tries to rename B to A.
3225 3214 */
3226 3215 void
3227 3216 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3228 3217 {
3229 3218 page_t *pp;
3230 3219 int olckcnt = 0;
3231 3220 int ocowcnt = 0;
3232 3221 kmutex_t *phm;
3233 3222 ulong_t index;
3234 3223
3235 3224 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3236 3225 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3237 3226 ASSERT(PP_ISFREE(opp) == 0);
3238 3227
3239 3228 VM_STAT_ADD(page_rename_count);
3240 3229
3241 3230 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3242 3231 "page rename:pp %p vp %p off %llx", opp, vp, off);
3243 3232
3244 3233 /*
3245 3234 * CacheFS may call page_rename for a large NFS page
3246 3235 * when both CacheFS and NFS mount points are used
3247 3236 * by applications. Demote this large page before
3248 3237 * renaming it, to ensure that there are no "partial"
3249 3238 * large pages left lying around.
3250 3239 */
3251 3240 if (opp->p_szc != 0) {
3252 3241 vnode_t *ovp = opp->p_vnode;
3253 3242 ASSERT(ovp != NULL);
3254 3243 ASSERT(!IS_SWAPFSVP(ovp));
3255 3244 ASSERT(!VN_ISKAS(ovp));
3256 3245 page_demote_vp_pages(opp);
3257 3246 ASSERT(opp->p_szc == 0);
3258 3247 }
3259 3248
3260 3249 page_hashout(opp, NULL);
3261 3250 PP_CLRAGED(opp);
3262 3251
3263 3252 /*
3264 3253 * Acquire the appropriate page hash lock, since
3265 3254 * we're going to rename the page.
3266 3255 */
3267 3256 index = PAGE_HASH_FUNC(vp, off);
3268 3257 phm = PAGE_HASH_MUTEX(index);
3269 3258 mutex_enter(phm);
3270 3259 top:
3271 3260 /*
3272 3261 * Look for an existing page with this name and destroy it if found.
3273 3262 * By holding the page hash lock all the way to the page_hashin()
3274 3263 * call, we are assured that no page can be created with this
3275 3264 * identity. In the case when the phm lock is dropped to undo any
3276 3265 * hat layer mappings, the existing page is held with an "exclusive"
3277 3266 * lock, again preventing another page from being created with
3278 3267 * this identity.
3279 3268 */
3280 - PAGE_HASH_SEARCH(index, pp, vp, off);
3269 + pp = page_hash_search(index, vp, off);
3281 3270 if (pp != NULL) {
3282 3271 VM_STAT_ADD(page_rename_exists);
3283 3272
3284 3273 /*
3285 3274 * As it turns out, this is one of only two places where
3286 3275 * page_lock() needs to hold the passed in lock in the
3287 3276 * successful case. In all of the others, the lock could
3288 3277 * be dropped as soon as the attempt is made to lock
3289 3278	 * the page.  It is tempting to add yet another argument,
3290 3279 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3291 3280 */
3292 3281 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3293 3282 /*
3294 3283 * Went to sleep because the page could not
3295 3284 * be locked. We were woken up when the page
3296 3285 * was unlocked, or when the page was destroyed.
3297 3286 * In either case, `phm' was dropped while we
3298 3287 * slept. Hence we should not just roar through
3299 3288 * this loop.
3300 3289 */
3301 3290 goto top;
3302 3291 }
3303 3292
3304 3293 /*
3305 3294 * If an existing page is a large page, then demote
3306 3295 * it to ensure that no "partial" large pages are
3307 3296 * "created" after page_rename. An existing page
3308 3297 * can be a CacheFS page, and can't belong to swapfs.
3309 3298 */
3310 3299 if (hat_page_is_mapped(pp)) {
3311 3300 /*
3312 3301 * Unload translations. Since we hold the
3313 3302 * exclusive lock on this page, the page
3314 3303 * can not be changed while we drop phm.
3315 3304 * This is also not a lock protocol violation,
3316 3305 * but rather the proper way to do things.
3317 3306 */
3318 3307 mutex_exit(phm);
3319 3308 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3320 3309 if (pp->p_szc != 0) {
3321 3310 ASSERT(!IS_SWAPFSVP(vp));
3322 3311 ASSERT(!VN_ISKAS(vp));
3323 3312 page_demote_vp_pages(pp);
3324 3313 ASSERT(pp->p_szc == 0);
3325 3314 }
3326 3315 mutex_enter(phm);
3327 3316 } else if (pp->p_szc != 0) {
3328 3317 ASSERT(!IS_SWAPFSVP(vp));
3329 3318 ASSERT(!VN_ISKAS(vp));
3330 3319 mutex_exit(phm);
3331 3320 page_demote_vp_pages(pp);
3332 3321 ASSERT(pp->p_szc == 0);
3333 3322 mutex_enter(phm);
3334 3323 }
3335 3324 page_hashout(pp, phm);
3336 3325 }
3337 3326 /*
3338 3327 * Hash in the page with the new identity.
3339 3328 */
3340 3329 if (!page_hashin(opp, vp, off, phm)) {
3341 3330 /*
3342 3331 * We were holding phm while we searched for [vp, off]
3343 3332 * and only dropped phm if we found and locked a page.
3344 3333	 * If we can't create this page now, then something
3345 3334 * is really broken.
3346 3335 */
3347 3336 panic("page_rename: Can't hash in page: %p", (void *)pp);
3348 3337 /*NOTREACHED*/
3349 3338 }
3350 3339
3351 3340 ASSERT(MUTEX_HELD(phm));
3352 3341 mutex_exit(phm);
3353 3342
3354 3343 /*
3355 3344 * Now that we have dropped phm, lets get around to finishing up
3356 3345 * with pp.
3357 3346 */
3358 3347 if (pp != NULL) {
3359 3348 ASSERT(!hat_page_is_mapped(pp));
3360 3349 /* for now large pages should not end up here */
3361 3350 ASSERT(pp->p_szc == 0);
3362 3351 /*
3363 3352 * Save the locks for transfer to the new page and then
3364 3353 * clear them so page_free doesn't think they're important.
3365 3354 * The page_struct_lock need not be acquired for lckcnt and
3366 3355 * cowcnt since the page has an "exclusive" lock.
3367 3356 */
3368 3357 olckcnt = pp->p_lckcnt;
3369 3358 ocowcnt = pp->p_cowcnt;
3370 3359 pp->p_lckcnt = pp->p_cowcnt = 0;
3371 3360
3372 3361 /*
3373 3362 * Put the page on the "free" list after we drop
3374 3363 * the lock. The less work under the lock the better.
3375 3364 */
3376 3365 /*LINTED: constant in conditional context*/
3377 3366 VN_DISPOSE(pp, B_FREE, 0, kcred);
3378 3367 }
3379 3368
3380 3369 /*
3381 3370 * Transfer the lock count from the old page (if any).
3382 3371 * The page_struct_lock need not be acquired for lckcnt and
3383 3372 * cowcnt since the page has an "exclusive" lock.
3384 3373 */
3385 3374 opp->p_lckcnt += olckcnt;
3386 3375 opp->p_cowcnt += ocowcnt;
3387 3376 }
3388 3377
3389 3378 /*
3390 3379 * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3391 3380 *
3392 3381 * Pages are normally inserted at the start of a vnode's v_pages list.
3393 3382 * If the vnode is VMODSORT and the page is modified, it goes at the end.
3394 3383 * This can happen when a modified page is relocated for DR.
3395 3384 *
3396 3385 * Returns 1 on success and 0 on failure.
3397 3386 */
3398 3387 static int
3399 3388 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3400 3389 {
3401 3390 page_t **listp;
3402 3391 page_t *tp;
3403 3392 ulong_t index;
3404 3393
3405 3394 ASSERT(PAGE_EXCL(pp));
3406 3395 ASSERT(vp != NULL);
3407 3396 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3408 3397
3409 3398 /*
3410 3399 * Be sure to set these up before the page is inserted on the hash
3411 3400 * list. As soon as the page is placed on the list some other
3412 3401 * thread might get confused and wonder how this page could
3413 3402 * possibly hash to this list.
3414 3403 */
3415 3404 pp->p_vnode = vp;
3416 3405 pp->p_offset = offset;
3417 3406
3418 3407 /*
3419 3408 * record if this page is on a swap vnode
3420 3409 */
3421 3410 if ((vp->v_flag & VISSWAP) != 0)
3422 3411 PP_SETSWAP(pp);
3423 3412
3424 3413 index = PAGE_HASH_FUNC(vp, offset);
3425 3414 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3426 3415 listp = &page_hash[index];
3427 3416
3428 3417 /*
3429 3418 * If this page is already hashed in, fail this attempt to add it.
3430 3419 */
3431 3420 for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3432 3421 if (tp->p_vnode == vp && tp->p_offset == offset) {
3433 3422 pp->p_vnode = NULL;
3434 3423 pp->p_offset = (u_offset_t)(-1);
3435 3424 return (0);
3436 3425 }
3437 3426 }
3438 3427 pp->p_hash = *listp;
3439 3428 *listp = pp;
3440 3429
3441 3430 /*
3442 3431 * Add the page to the vnode's list of pages
3443 3432 */
3444 3433 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3445 3434 listp = &vp->v_pages->p_vpprev->p_vpnext;
3446 3435 else
3447 3436 listp = &vp->v_pages;
3448 3437
3449 3438 page_vpadd(listp, pp);
3450 3439
3451 3440 return (1);
3452 3441 }
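
The insert protocol above is worth seeing in isolation: set the identity fields before the page becomes findable, scan the bucket for a duplicate, then push onto the head of the chain. A minimal user-space sketch of the same shape; the names, the stand-in hash, and the fixed power-of-two bucket count are all illustrative, not the kernel's actual page_t or PAGE_HASH_FUNC:

#include <stddef.h>

#define	NBUCKETS	256		/* assumption: power of two */

struct pg {
	struct pg *hash;		/* next page on the bucket chain */
	void *vnode;			/* identity: <vnode, offset> */
	unsigned long long offset;
};

static struct pg *buckets[NBUCKETS];

static size_t
bucket(void *vp, unsigned long long off)
{
	/* Stand-in for PAGE_HASH_FUNC: fold the identity into an index. */
	return ((((size_t)vp >> 6) + (size_t)(off >> 12)) & (NBUCKETS - 1));
}

/* Returns 1 on success, 0 if <vp, off> is already hashed in. */
static int
hashin(struct pg *pp, void *vp, unsigned long long off)
{
	struct pg **listp = &buckets[bucket(vp, off)];
	struct pg *tp;

	/* Set the identity before the page becomes findable. */
	pp->vnode = vp;
	pp->offset = off;

	for (tp = *listp; tp != NULL; tp = tp->hash) {
		if (tp->vnode == vp && tp->offset == off) {
			pp->vnode = NULL;	/* undo: duplicate exists */
			pp->offset = ~0ULL;
			return (0);
		}
	}
	pp->hash = *listp;			/* push onto the bucket head */
	*listp = pp;
	return (1);
}

In the kernel the caller holds the bucket's hash mutex around this whole sequence, as page_hashin() below demonstrates.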
3453 3442
3454 3443 /*
3455 3444 * Add page `pp' to both the hash and vp chains for [vp, offset].
3456 3445 *
3457 3446 * Returns 1 on success and 0 on failure.
3458 3447 * If hold is passed in, it is not dropped.
3459 3448 */
3460 3449 int
3461 3450 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3462 3451 {
3463 3452 kmutex_t *phm = NULL;
3464 3453 kmutex_t *vphm;
3465 3454 int rc;
3466 3455
3467 3456 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3468 3457 ASSERT(pp->p_fsdata == 0 || panicstr);
3469 3458
3470 3459 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3471 3460 "page_hashin:pp %p vp %p offset %llx",
3472 3461 pp, vp, offset);
3473 3462
3474 3463 VM_STAT_ADD(hashin_count);
3475 3464
3476 3465 if (hold != NULL)
3477 3466 phm = hold;
3478 3467 else {
3479 3468 VM_STAT_ADD(hashin_not_held);
3480 3469 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3481 3470 mutex_enter(phm);
3482 3471 }
3483 3472
3484 3473 vphm = page_vnode_mutex(vp);
3485 3474 mutex_enter(vphm);
3486 3475 rc = page_do_hashin(pp, vp, offset);
3487 3476 mutex_exit(vphm);
3488 3477 if (hold == NULL)
3489 3478 mutex_exit(phm);
3490 3479 if (rc == 0)
3491 3480 VM_STAT_ADD(hashin_already);
3492 3481 return (rc);
3493 3482 }
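
The `hold' argument is the usual optional-lock convention: when the caller already owns the bucket mutex, the routine must neither re-acquire nor drop it. A hedged pthread sketch of that calling convention; do_insert() is a placeholder for the work done under the lock, not a real kernel call:

#include <pthread.h>

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

/* Placeholder for the real work done under the lock. */
static int
do_insert(void *item)
{
	(void) item;
	return (1);
}

/*
 * `hold', when non-NULL, is a lock the caller already owns; take and
 * drop the lock locally only when the caller did not pass one in.
 */
static int
insert(void *item, pthread_mutex_t *hold)
{
	int rc;

	if (hold == NULL)
		pthread_mutex_lock(&bucket_lock);
	rc = do_insert(item);		/* runs under a lock either way */
	if (hold == NULL)
		pthread_mutex_unlock(&bucket_lock);
	return (rc);
}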
3494 3483
3495 3484 /*
3496 3485 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3497 3486 * All mutexes must be held
3498 3487 */
3499 3488 static void
3500 3489 page_do_hashout(page_t *pp)
3501 3490 {
3502 3491 page_t **hpp;
3503 3492 page_t *hp;
3504 3493 vnode_t *vp = pp->p_vnode;
3505 3494
3506 3495 ASSERT(vp != NULL);
3507 3496 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3508 3497
3509 3498 /*
3510 3499 * First, take pp off of its hash chain.
3511 3500 */
3512 3501 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3513 3502
3514 3503 for (;;) {
3515 3504 hp = *hpp;
3516 3505 if (hp == pp)
3517 3506 break;
3518 3507 if (hp == NULL) {
3519 3508 panic("page_do_hashout");
3520 3509 /*NOTREACHED*/
3521 3510 }
3522 3511 hpp = &hp->p_hash;
3523 3512 }
3524 3513 *hpp = pp->p_hash;
3525 3514
3526 3515 /*
3527 3516 * Now remove it from its associated vnode.
3528 3517 */
3529 3518 if (vp->v_pages)
3530 3519 page_vpsub(&vp->v_pages, pp);
3531 3520
3532 3521 pp->p_hash = NULL;
3533 3522 page_clr_all_props(pp);
3534 3523 PP_CLRSWAP(pp);
3535 3524 pp->p_vnode = NULL;
3536 3525 pp->p_offset = (u_offset_t)-1;
3537 3526 pp->p_fsdata = 0;
3538 3527 }
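
The removal loop in page_do_hashout() uses the indirect-pointer idiom: hpp always points at the link that refers to the current node, so the unlink is a single store and the head of the chain needs no special case. A standalone, runnable sketch of the same idiom:

#include <assert.h>
#include <stddef.h>

struct node {
	struct node *next;
	int key;
};

/* Unlink the node with `key' from *headp; asserts it is present. */
static void
unlink_key(struct node **headp, int key)
{
	struct node **npp, *np;

	for (npp = headp; ; npp = &np->next) {
		np = *npp;
		assert(np != NULL);		/* like the panic above */
		if (np->key == key)
			break;
	}
	*npp = np->next;		/* one store, no head special case */
	np->next = NULL;
}

int
main(void)
{
	struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct node *head = &a;

	unlink_key(&head, 2);		/* works mid-list... */
	unlink_key(&head, 1);		/* ...and at the head */
	assert(head == &c);
	return (0);
}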
3539 3528
3540 3529 /*
3541 3530 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3542 3531 *
3543 3532 * When `phm' is non-NULL it contains the address of the mutex protecting the
3544 3533 * hash list pp is on. It is not dropped.
3545 3534 */
3546 3535 void
3547 3536 page_hashout(page_t *pp, kmutex_t *phm)
3548 3537 {
3549 3538 vnode_t *vp;
3550 3539 ulong_t index;
3551 3540 kmutex_t *nphm;
3552 3541 kmutex_t *vphm;
3553 3542 kmutex_t *sep;
3554 3543
3555 3544 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3556 3545 ASSERT(pp->p_vnode != NULL);
3557 3546 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3558 3547 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3559 3548
3560 3549 vp = pp->p_vnode;
3561 3550
3562 3551 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3563 3552 "page_hashout:pp %p vp %p", pp, vp);
3564 3553
3565 3554 /* Kernel probe */
3566 3555 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3567 3556 tnf_opaque, vnode, vp,
3568 3557 tnf_offset, offset, pp->p_offset);
3569 3558
3570 3559	/*
3571 3560	 * Take the hash mutex for this bucket unless the caller holds it.
3572 3561	 */
3573 3562 VM_STAT_ADD(hashout_count);
3574 3563 index = PAGE_HASH_FUNC(vp, pp->p_offset);
3575 3564 if (phm == NULL) {
3576 3565 VM_STAT_ADD(hashout_not_held);
3577 3566 nphm = PAGE_HASH_MUTEX(index);
3578 3567 mutex_enter(nphm);
3579 3568 }
3580 3569 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3581 3570
3582 3571
3583 3572 /*
3584 3573	 * Grab the page vnode mutex and remove the page from both chains.
3585 3574 */
3586 3575 vphm = page_vnode_mutex(vp);
3587 3576 mutex_enter(vphm);
3588 3577
3589 3578 page_do_hashout(pp);
3590 3579
3591 3580 mutex_exit(vphm);
3592 3581 if (phm == NULL)
3593 3582 mutex_exit(nphm);
3594 3583
3595 3584 /*
3596 3585 * Wake up processes waiting for this page. The page's
3597 3586 * identity has been changed, and is probably not the
3598 3587 * desired page any longer.
3599 3588 */
3600 3589 sep = page_se_mutex(pp);
3601 3590 mutex_enter(sep);
3602 3591 pp->p_selock &= ~SE_EWANTED;
3603 3592 if (CV_HAS_WAITERS(&pp->p_cv))
3604 3593 cv_broadcast(&pp->p_cv);
3605 3594 mutex_exit(sep);
3606 3595 }
3607 3596
3608 3597 /*
3609 3598 * Add the page to the front of a linked list of pages
3610 3599 * using the p_next & p_prev pointers for the list.
3611 3600 * The caller is responsible for protecting the list pointers.
3612 3601 */
3613 3602 void
3614 3603 page_add(page_t **ppp, page_t *pp)
3615 3604 {
3616 3605 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3617 3606
3618 3607 page_add_common(ppp, pp);
3619 3608 }
3620 3609
3621 3610
3622 3611
3623 3612 /*
3624 3613 * Common code for page_add() and mach_page_add()
3625 3614 */
3626 3615 void
3627 3616 page_add_common(page_t **ppp, page_t *pp)
3628 3617 {
3629 3618 if (*ppp == NULL) {
3630 3619 pp->p_next = pp->p_prev = pp;
3631 3620 } else {
3632 3621 pp->p_next = *ppp;
3633 3622 pp->p_prev = (*ppp)->p_prev;
3634 3623 (*ppp)->p_prev = pp;
3635 3624 pp->p_prev->p_next = pp;
3636 3625 }
3637 3626 *ppp = pp;
3638 3627 }
3639 3628
3640 3629
3641 3630 /*
3642 3631 * Remove this page from a linked list of pages
3643 3632 * using the p_next & p_prev pointers for the list.
3644 3633 *
3645 3634 * The caller is responsible for protecting the list pointers.
3646 3635 */
3647 3636 void
3648 3637 page_sub(page_t **ppp, page_t *pp)
3649 3638 {
3650 3639 ASSERT((PP_ISFREE(pp)) ? 1 :
3651 3640 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3652 3641
3653 3642 if (*ppp == NULL || pp == NULL) {
3654 3643 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3655 3644 (void *)pp, (void *)(*ppp));
3656 3645 /*NOTREACHED*/
3657 3646 }
3658 3647
3659 3648 page_sub_common(ppp, pp);
3660 3649 }
3661 3650
3662 3651
3663 3652 /*
3664 3653 * Common code for page_sub() and mach_page_sub()
3665 3654 */
3666 3655 void
3667 3656 page_sub_common(page_t **ppp, page_t *pp)
3668 3657 {
3669 3658 if (*ppp == pp)
3670 3659 *ppp = pp->p_next; /* go to next page */
3671 3660
3672 3661 if (*ppp == pp)
3673 3662 *ppp = NULL; /* page list is gone */
3674 3663 else {
3675 3664 pp->p_prev->p_next = pp->p_next;
3676 3665 pp->p_next->p_prev = pp->p_prev;
3677 3666 }
3678 3667 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
3679 3668 }
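
page_add_common() and page_sub_common() maintain a circular doubly linked list in which an empty list is NULL and a singleton element points at itself; page_sub_common()'s two head checks fall straight out of that representation. A self-contained sketch of both operations, with a small check in main():

#include <assert.h>
#include <stddef.h>

struct elem {
	struct elem *next, *prev;
};

static void
list_add(struct elem **lpp, struct elem *ep)
{
	if (*lpp == NULL) {
		ep->next = ep->prev = ep;	/* singleton points at itself */
	} else {
		ep->next = *lpp;
		ep->prev = (*lpp)->prev;
		(*lpp)->prev = ep;
		ep->prev->next = ep;
	}
	*lpp = ep;				/* new element is the head */
}

static void
list_sub(struct elem **lpp, struct elem *ep)
{
	if (*lpp == ep)
		*lpp = ep->next;		/* advance head past ep */

	if (*lpp == ep)
		*lpp = NULL;			/* ep was the only element */
	else {
		ep->prev->next = ep->next;
		ep->next->prev = ep->prev;
	}
	ep->prev = ep->next = ep;		/* leave ep a list of one */
}

int
main(void)
{
	struct elem a, b, *list = NULL;

	list_add(&list, &a);
	list_add(&list, &b);
	assert(list == &b && b.next == &a && a.next == &b);
	list_sub(&list, &b);
	assert(list == &a && a.next == &a && a.prev == &a);
	list_sub(&list, &a);
	assert(list == NULL);
	return (0);
}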
3680 3669
3681 3670
3682 3671 /*
3683 3672	 * Break page list oppp into two lists with npages in the first list.
3684 3673 * The tail is returned in nppp.
3685 3674 */
3686 3675 void
3687 3676 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3688 3677 {
3689 3678 page_t *s1pp = *oppp;
3690 3679 page_t *s2pp;
3691 3680 page_t *e1pp, *e2pp;
3692 3681 long n = 0;
3693 3682
3694 3683 if (s1pp == NULL) {
3695 3684 *nppp = NULL;
3696 3685 return;
3697 3686 }
3698 3687 if (npages == 0) {
3699 3688 *nppp = s1pp;
3700 3689 *oppp = NULL;
3701 3690 return;
3702 3691 }
3703 3692 for (n = 0, s2pp = *oppp; n < npages; n++) {
3704 3693 s2pp = s2pp->p_next;
3705 3694 }
3706 3695 /* Fix head and tail of new lists */
3707 3696 e1pp = s2pp->p_prev;
3708 3697 e2pp = s1pp->p_prev;
3709 3698 s1pp->p_prev = e1pp;
3710 3699 e1pp->p_next = s1pp;
3711 3700 s2pp->p_prev = e2pp;
3712 3701 e2pp->p_next = s2pp;
3713 3702
3714 3703 /* second list empty */
3715 3704 if (s2pp == s1pp) {
3716 3705 *oppp = s1pp;
3717 3706 *nppp = NULL;
3718 3707 } else {
3719 3708 *oppp = s1pp;
3720 3709 *nppp = s2pp;
3721 3710 }
3722 3711 }
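
Once the loop has located the first element of the tail, the split is four pointer stores: each half's head and tail are rewired to each other. A runnable sketch of page_list_break()'s pointer surgery; the empty-list and npages == 0 special cases are omitted for brevity:

#include <assert.h>
#include <stddef.h>

struct elem {
	struct elem *next, *prev;
};

/* Split circular list *opp after n elements; the tail goes to *npp. */
static void
list_break(struct elem **opp, struct elem **npp, int n)
{
	struct elem *s1 = *opp, *s2 = s1, *e1, *e2;
	int i;

	for (i = 0; i < n; i++)
		s2 = s2->next;		/* s2 = first element of the tail */
	e1 = s2->prev;			/* last element of the head half */
	e2 = s1->prev;			/* last element of the tail half */
	s1->prev = e1;			/* close the head ring ... */
	e1->next = s1;
	s2->prev = e2;			/* ... and the tail ring */
	e2->next = s2;
	*opp = s1;
	*npp = (s2 == s1) ? NULL : s2;	/* n == length: tail is empty */
}

int
main(void)
{
	struct elem e[4];
	struct elem *head, *tail;
	int i;

	for (i = 0; i < 4; i++) {	/* build e0 -> e1 -> e2 -> e3 -> e0 */
		e[i].next = &e[(i + 1) % 4];
		e[i].prev = &e[(i + 3) % 4];
	}
	head = &e[0];
	list_break(&head, &tail, 2);
	assert(head == &e[0] && head->next == &e[1] && e[1].next == &e[0]);
	assert(tail == &e[2] && tail->next == &e[3] && e[3].next == &e[2]);
	return (0);
}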
3723 3712
3724 3713 /*
3725 3714 * Concatenate page list nppp onto the end of list ppp.
3726 3715 */
3727 3716 void
3728 3717 page_list_concat(page_t **ppp, page_t **nppp)
3729 3718 {
3730 3719 page_t *s1pp, *s2pp, *e1pp, *e2pp;
3731 3720
3732 3721 if (*nppp == NULL) {
3733 3722 return;
3734 3723 }
3735 3724 if (*ppp == NULL) {
3736 3725 *ppp = *nppp;
3737 3726 return;
3738 3727 }
3739 3728 s1pp = *ppp;
3740 3729 e1pp = s1pp->p_prev;
3741 3730 s2pp = *nppp;
3742 3731 e2pp = s2pp->p_prev;
3743 3732 s1pp->p_prev = e2pp;
3744 3733 e2pp->p_next = s1pp;
3745 3734 e1pp->p_next = s2pp;
3746 3735 s2pp->p_prev = e1pp;
3747 3736 }
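
Concatenation is the mirror image: four stores splice the two rings together. A matching sketch:

#include <assert.h>
#include <stddef.h>

struct elem {
	struct elem *next, *prev;
};

/* Splice circular list *npp onto the end of *ppp, as in page_list_concat(). */
static void
list_concat(struct elem **ppp, struct elem **npp)
{
	struct elem *s1, *s2, *e1, *e2;

	if (*npp == NULL)
		return;
	if (*ppp == NULL) {
		*ppp = *npp;
		return;
	}
	s1 = *ppp;		/* head of the first list */
	e1 = s1->prev;		/* tail of the first list */
	s2 = *npp;		/* head of the second list */
	e2 = s2->prev;		/* tail of the second list */
	s1->prev = e2;		/* four stores join the rings */
	e2->next = s1;
	e1->next = s2;
	s2->prev = e1;
}

int
main(void)
{
	struct elem a = { &a, &a }, b = { &b, &b };
	struct elem *l1 = &a, *l2 = &b;

	list_concat(&l1, &l2);
	assert(l1 == &a && a.next == &b && b.next == &a);
	return (0);
}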
3748 3737
3749 3738 /*
3750 3739 * return the next page in the page list
3751 3740 */
3752 3741 page_t *
3753 3742 page_list_next(page_t *pp)
3754 3743 {
3755 3744 return (pp->p_next);
3756 3745 }
3757 3746
3758 3747
3759 3748 /*
3760 3749 * Add the page to the front of the linked list of pages
3761 3750 * using p_vpnext/p_vpprev pointers for the list.
3762 3751 *
3763 3752 * The caller is responsible for protecting the lists.
3764 3753 */
3765 3754 void
3766 3755 page_vpadd(page_t **ppp, page_t *pp)
3767 3756 {
3768 3757 if (*ppp == NULL) {
3769 3758 pp->p_vpnext = pp->p_vpprev = pp;
3770 3759 } else {
3771 3760 pp->p_vpnext = *ppp;
3772 3761 pp->p_vpprev = (*ppp)->p_vpprev;
3773 3762 (*ppp)->p_vpprev = pp;
3774 3763 pp->p_vpprev->p_vpnext = pp;
3775 3764 }
3776 3765 *ppp = pp;
3777 3766 }
3778 3767
3779 3768 /*
3780 3769 * Remove this page from the linked list of pages
3781 3770 * using p_vpnext/p_vpprev pointers for the list.
3782 3771 *
3783 3772 * The caller is responsible for protecting the lists.
3784 3773 */
3785 3774 void
3786 3775 page_vpsub(page_t **ppp, page_t *pp)
3787 3776 {
3788 3777 if (*ppp == NULL || pp == NULL) {
3789 3778 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3790 3779 (void *)pp, (void *)(*ppp));
3791 3780 /*NOTREACHED*/
3792 3781 }
3793 3782
3794 3783 if (*ppp == pp)
3795 3784 *ppp = pp->p_vpnext; /* go to next page */
3796 3785
3797 3786 if (*ppp == pp)
3798 3787 *ppp = NULL; /* page list is gone */
3799 3788 else {
3800 3789 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3801 3790 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3802 3791 }
3803 3792 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3804 3793 }
3805 3794
3806 3795 /*
3807 3796 * Lock a physical page into memory "long term". Used to support "lock
3808 3797 * in memory" functions. Accepts the page to be locked, and a cow variable
3809 3798	 * to indicate whether the lock will travel to the new page during
3810 3799 * a potential copy-on-write.
3811 3800 */
3812 3801 int
3813 3802 page_pp_lock(
3814 3803 page_t *pp, /* page to be locked */
3815 3804 int cow, /* cow lock */
3816 3805 int kernel) /* must succeed -- ignore checking */
3817 3806 {
3818 3807 int r = 0; /* result -- assume failure */
3819 3808
3820 3809 ASSERT(PAGE_LOCKED(pp));
3821 3810
3822 3811 page_struct_lock(pp);
3823 3812 /*
3824 3813 * Acquire the "freemem_lock" for availrmem.
3825 3814 */
3826 3815 if (cow) {
3827 3816 mutex_enter(&freemem_lock);
3828 3817 if ((availrmem > pages_pp_maximum) &&
3829 3818 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3830 3819 availrmem--;
3831 3820 pages_locked++;
3832 3821 mutex_exit(&freemem_lock);
3833 3822 r = 1;
3834 3823 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3835 3824 cmn_err(CE_WARN,
3836 3825 "COW lock limit reached on pfn 0x%lx",
3837 3826 page_pptonum(pp));
3838 3827 }
3839 3828 } else
3840 3829 mutex_exit(&freemem_lock);
3841 3830 } else {
3842 3831 if (pp->p_lckcnt) {
3843 3832 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3844 3833 r = 1;
3845 3834 if (++pp->p_lckcnt ==
3846 3835 (ushort_t)PAGE_LOCK_MAXIMUM) {
3847 3836 cmn_err(CE_WARN, "Page lock limit "
3848 3837 "reached on pfn 0x%lx",
3849 3838 page_pptonum(pp));
3850 3839 }
3851 3840 }
3852 3841 } else {
3853 3842 if (kernel) {
3854 3843 /* availrmem accounting done by caller */
3855 3844 ++pp->p_lckcnt;
3856 3845 r = 1;
3857 3846 } else {
3858 3847 mutex_enter(&freemem_lock);
3859 3848 if (availrmem > pages_pp_maximum) {
3860 3849 availrmem--;
3861 3850 pages_locked++;
3862 3851 ++pp->p_lckcnt;
3863 3852 r = 1;
3864 3853 }
3865 3854 mutex_exit(&freemem_lock);
3866 3855 }
3867 3856 }
3868 3857 }
3869 3858 page_struct_unlock(pp);
3870 3859 return (r);
3871 3860 }
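
Stripped of the cow/kernel variations, page_pp_lock()'s accounting rule is: charge the shared pool only for a page's first long-term lock, and refuse to advance a counter that has hit its ceiling, warning when it saturates. A hedged user-space reduction of the non-cow path; pool, pool_floor, and pin() are illustrative names, and the caller is assumed to serialize access to *cnt the way page_struct_lock() does above:

#include <limits.h>
#include <pthread.h>
#include <stdio.h>

#define	LOCK_MAXIMUM	USHRT_MAX	/* plays PAGE_LOCK_MAXIMUM */

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static long pool = 1000;		/* plays availrmem */
static long pool_floor = 100;		/* plays pages_pp_maximum */

/* Returns 1 if the pin was granted, 0 otherwise. */
static int
pin(unsigned short *cnt)
{
	int r = 0;

	if (*cnt != 0) {
		/* Already pinned: no new pool charge, just count it. */
		if (*cnt < LOCK_MAXIMUM) {
			r = 1;
			if (++*cnt == LOCK_MAXIMUM)
				(void) fprintf(stderr, "pin limit reached\n");
		}
	} else {
		/* First pin: charge the shared pool, if it can afford it. */
		pthread_mutex_lock(&pool_lock);
		if (pool > pool_floor) {
			pool--;
			++*cnt;
			r = 1;
		}
		pthread_mutex_unlock(&pool_lock);
	}
	return (r);
}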
3872 3861
3873 3862 /*
3874 3863 * Decommit a lock on a physical page frame. Account for cow locks if
3875 3864 * appropriate.
3876 3865 */
3877 3866 void
3878 3867 page_pp_unlock(
3879 3868 page_t *pp, /* page to be unlocked */
3880 3869 int cow, /* expect cow lock */
3881 3870 int kernel) /* this was a kernel lock */
3882 3871 {
3883 3872 ASSERT(PAGE_LOCKED(pp));
3884 3873
3885 3874 page_struct_lock(pp);
3886 3875 /*
3887 3876 * Acquire the "freemem_lock" for availrmem.
3888 3877	 * If cowcnt or lckcnt is already 0 do nothing; i.e., we
3889 3878 * could be called to unlock even if nothing is locked. This could
3890 3879 * happen if locked file pages were truncated (removing the lock)
3891 3880 * and the file was grown again and new pages faulted in; the new
3892 3881 * pages are unlocked but the segment still thinks they're locked.
3893 3882 */
3894 3883 if (cow) {
3895 3884 if (pp->p_cowcnt) {
3896 3885 mutex_enter(&freemem_lock);
3897 3886 pp->p_cowcnt--;
3898 3887 availrmem++;
3899 3888 pages_locked--;
3900 3889 mutex_exit(&freemem_lock);
3901 3890 }
3902 3891 } else {
3903 3892 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3904 3893 if (!kernel) {
3905 3894 mutex_enter(&freemem_lock);
3906 3895 availrmem++;
3907 3896 pages_locked--;
3908 3897 mutex_exit(&freemem_lock);
3909 3898 }
3910 3899 }
3911 3900 }
3912 3901 page_struct_unlock(pp);
3913 3902 }
3914 3903
3915 3904 /*
3916 3905 * This routine reserves availrmem for npages;
3917 3906 * flags: KM_NOSLEEP or KM_SLEEP
3918 3907 * returns 1 on success or 0 on failure
3919 3908 */
3920 3909 int
3921 3910 page_resv(pgcnt_t npages, uint_t flags)
3922 3911 {
3923 3912 mutex_enter(&freemem_lock);
3924 3913 while (availrmem < tune.t_minarmem + npages) {
3925 3914 if (flags & KM_NOSLEEP) {
3926 3915 mutex_exit(&freemem_lock);
3927 3916 return (0);
3928 3917 }
3929 3918 mutex_exit(&freemem_lock);
3930 3919 page_needfree(npages);
3931 3920 kmem_reap();
3932 3921 delay(hz >> 2);
3933 3922 page_needfree(-(spgcnt_t)npages);
3934 3923 mutex_enter(&freemem_lock);
3935 3924 }
3936 3925 availrmem -= npages;
3937 3926 mutex_exit(&freemem_lock);
3938 3927 return (1);
3939 3928 }
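
page_resv() is a reserve-with-backoff loop: fail immediately for KM_NOSLEEP, otherwise nudge the reclaimer, sleep for a quarter second, and re-check. A minimal pthread sketch of the same shape; reclaim() stands in for the page_needfree()/kmem_reap() calls:

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t res_lock = PTHREAD_MUTEX_INITIALIZER;
static long res_avail = 1 << 20;	/* plays availrmem */
static long res_min = 1 << 10;		/* plays tune.t_minarmem */

static void
reclaim(void)
{
	/* Stand-in for page_needfree() + kmem_reap(). */
}

/* Returns 1 on success, 0 when nosleep is set and resources are short. */
static int
reserve(long n, int nosleep)
{
	pthread_mutex_lock(&res_lock);
	while (res_avail < res_min + n) {
		if (nosleep) {
			pthread_mutex_unlock(&res_lock);
			return (0);
		}
		pthread_mutex_unlock(&res_lock);
		reclaim();			/* try to make headroom */
		usleep(250000);			/* plays delay(hz >> 2) */
		pthread_mutex_lock(&res_lock);
	}
	res_avail -= n;
	pthread_mutex_unlock(&res_lock);
	return (1);
}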
3940 3929
3941 3930 /*
3942 3931 * This routine unreserves availrmem for npages;
3943 3932 */
3944 3933 void
3945 3934 page_unresv(pgcnt_t npages)
3946 3935 {
3947 3936 mutex_enter(&freemem_lock);
3948 3937 availrmem += npages;
3949 3938 mutex_exit(&freemem_lock);
3950 3939 }
3951 3940
3952 3941 /*
3953 3942 * See Statement at the beginning of segvn_lockop() regarding
3954 3943 * the way we handle cowcnts and lckcnts.
3955 3944 *
3956 3945 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3957 3946 * that breaks COW has PROT_WRITE.
3958 3947 *
3959 3948	 * Note that we may also break COW in case we are softlocking
3960 3949 * on read access during physio;
3961 3950 * in this softlock case, the vpage may not have PROT_WRITE.
3962 3951 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3963 3952 * if the vpage doesn't have PROT_WRITE.
3964 3953 *
3965 3954 * This routine is never called if we are stealing a page
3966 3955 * in anon_private.
3967 3956 *
3968 3957	 * The caller subtracted from availrmem for a read-only mapping;
3969 3958	 * if lckcnt is 1, increment availrmem.
3970 3959 */
3971 3960 void
3972 3961 page_pp_useclaim(
3973 3962 page_t *opp, /* original page frame losing lock */
3974 3963 page_t *npp, /* new page frame gaining lock */
3975 3964 uint_t write_perm) /* set if vpage has PROT_WRITE */
3976 3965 {
3977 3966 int payback = 0;
3978 3967 int nidx, oidx;
3979 3968
3980 3969 ASSERT(PAGE_LOCKED(opp));
3981 3970 ASSERT(PAGE_LOCKED(npp));
3982 3971
3983 3972 /*
3984 3973 * Since we have two pages we probably have two locks. We need to take
3985 3974 * them in a defined order to avoid deadlocks. It's also possible they
3986 3975 * both hash to the same lock in which case this is a non-issue.
3987 3976 */
3988 3977 nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3989 3978 oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3990 3979 if (nidx < oidx) {
3991 3980 page_struct_lock(npp);
3992 3981 page_struct_lock(opp);
3993 3982 } else if (oidx < nidx) {
3994 3983 page_struct_lock(opp);
3995 3984 page_struct_lock(npp);
3996 3985 } else { /* The pages hash to the same lock */
3997 3986 page_struct_lock(npp);
3998 3987 }
3999 3988
4000 3989 ASSERT(npp->p_cowcnt == 0);
4001 3990 ASSERT(npp->p_lckcnt == 0);
4002 3991
4003 3992 /* Don't use claim if nothing is locked (see page_pp_unlock above) */
4004 3993 if ((write_perm && opp->p_cowcnt != 0) ||
4005 3994 (!write_perm && opp->p_lckcnt != 0)) {
4006 3995
4007 3996 if (write_perm) {
4008 3997 npp->p_cowcnt++;
4009 3998 ASSERT(opp->p_cowcnt != 0);
4010 3999 opp->p_cowcnt--;
4011 4000 } else {
4012 4001
4013 4002 ASSERT(opp->p_lckcnt != 0);
4014 4003
4015 4004 /*
4016 4005 * We didn't need availrmem decremented if p_lckcnt on
4017 4006 * original page is 1. Here, we are unlocking
4018 4007 * read-only copy belonging to original page and
4019 4008 * are locking a copy belonging to new page.
4020 4009 */
4021 4010 if (opp->p_lckcnt == 1)
4022 4011 payback = 1;
4023 4012
4024 4013 npp->p_lckcnt++;
4025 4014 opp->p_lckcnt--;
4026 4015 }
4027 4016 }
4028 4017 if (payback) {
4029 4018 mutex_enter(&freemem_lock);
4030 4019 availrmem++;
4031 4020 pages_useclaim--;
4032 4021 mutex_exit(&freemem_lock);
4033 4022 }
4034 4023
4035 4024 if (nidx < oidx) {
4036 4025 page_struct_unlock(opp);
4037 4026 page_struct_unlock(npp);
4038 4027 } else if (oidx < nidx) {
4039 4028 page_struct_unlock(npp);
4040 4029 page_struct_unlock(opp);
4041 4030 } else { /* The pages hash to the same lock */
4042 4031 page_struct_unlock(npp);
4043 4032 }
4044 4033 }
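
The lock bracketing above generalizes to any pair of objects hashed into a stripe of locks: compare the indices, lock in ascending index order, and take only one lock when both objects collide on the same stripe. A sketch, assuming the lock array is initialized elsewhere:

#include <pthread.h>

#define	NLOCKS	64

/* Stripe of locks; assume each is set up with pthread_mutex_init(). */
static pthread_mutex_t locks[NLOCKS];

static unsigned
lock_idx(const void *obj)
{
	/* Stand-in hash from object address to stripe index. */
	return ((unsigned)((unsigned long)obj >> 7) % NLOCKS);
}

/* Take the stripe locks covering both objects without risking deadlock. */
static void
lock_pair(const void *a, const void *b)
{
	unsigned ai = lock_idx(a), bi = lock_idx(b);

	if (ai < bi) {
		pthread_mutex_lock(&locks[ai]);
		pthread_mutex_lock(&locks[bi]);
	} else if (bi < ai) {
		pthread_mutex_lock(&locks[bi]);
		pthread_mutex_lock(&locks[ai]);
	} else {
		pthread_mutex_lock(&locks[ai]);	/* same stripe: one lock */
	}
}

static void
unlock_pair(const void *a, const void *b)
{
	unsigned ai = lock_idx(a), bi = lock_idx(b);

	if (ai < bi) {
		pthread_mutex_unlock(&locks[bi]);
		pthread_mutex_unlock(&locks[ai]);
	} else if (bi < ai) {
		pthread_mutex_unlock(&locks[ai]);
		pthread_mutex_unlock(&locks[bi]);
	} else {
		pthread_mutex_unlock(&locks[ai]);
	}
}

Unlocking in the reverse order, as both the sketch and page_pp_useclaim() do, is convention rather than a correctness requirement for mutexes.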
4045 4034
4046 4035 /*
4047 4036 * Simple claim adjust functions -- used to support changes in
4048 4037 * claims due to changes in access permissions. Used by segvn_setprot().
4049 4038 */
4050 4039 int
4051 4040 page_addclaim(page_t *pp)
4052 4041 {
4053 4042 int r = 0; /* result */
4054 4043
4055 4044 ASSERT(PAGE_LOCKED(pp));
4056 4045
4057 4046 page_struct_lock(pp);
4058 4047 ASSERT(pp->p_lckcnt != 0);
4059 4048
4060 4049 if (pp->p_lckcnt == 1) {
4061 4050 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4062 4051 --pp->p_lckcnt;
4063 4052 r = 1;
4064 4053 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4065 4054 cmn_err(CE_WARN,
4066 4055 "COW lock limit reached on pfn 0x%lx",
4067 4056 page_pptonum(pp));
4068 4057 }
4069 4058 }
4070 4059 } else {
4071 4060 mutex_enter(&freemem_lock);
4072 4061 if ((availrmem > pages_pp_maximum) &&
4073 4062 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4074 4063 --availrmem;
4075 4064 ++pages_claimed;
4076 4065 mutex_exit(&freemem_lock);
4077 4066 --pp->p_lckcnt;
4078 4067 r = 1;
4079 4068 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4080 4069 cmn_err(CE_WARN,
4081 4070 "COW lock limit reached on pfn 0x%lx",
4082 4071 page_pptonum(pp));
4083 4072 }
4084 4073 } else
4085 4074 mutex_exit(&freemem_lock);
4086 4075 }
4087 4076 page_struct_unlock(pp);
4088 4077 return (r);
4089 4078 }
4090 4079
4091 4080 int
4092 4081 page_subclaim(page_t *pp)
4093 4082 {
4094 4083 int r = 0;
4095 4084
4096 4085 ASSERT(PAGE_LOCKED(pp));
4097 4086
4098 4087 page_struct_lock(pp);
4099 4088 ASSERT(pp->p_cowcnt != 0);
4100 4089
4101 4090 if (pp->p_lckcnt) {
4102 4091 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4103 4092 r = 1;
4104 4093			/*
4105 4094			 * Return the claimed unit to availrmem.
4106 4095			 */
4107 4096 mutex_enter(&freemem_lock);
4108 4097 availrmem++;
4109 4098 pages_claimed--;
4110 4099 mutex_exit(&freemem_lock);
4111 4100
4112 4101 pp->p_cowcnt--;
4113 4102
4114 4103 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4115 4104 cmn_err(CE_WARN,
4116 4105 "Page lock limit reached on pfn 0x%lx",
4117 4106 page_pptonum(pp));
4118 4107 }
4119 4108 }
4120 4109 } else {
4121 4110 r = 1;
4122 4111 pp->p_cowcnt--;
4123 4112 pp->p_lckcnt++;
4124 4113 }
4125 4114 page_struct_unlock(pp);
4126 4115 return (r);
4127 4116 }
4128 4117
4129 4118 /*
4130 4119 * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4131 4120 * page.
4132 4121 */
4133 4122 int
4134 4123 page_addclaim_pages(page_t **ppa)
4135 4124 {
4136 4125 pgcnt_t lckpgs = 0, pg_idx;
4137 4126
4138 4127 VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4139 4128
4140 4129 /*
4141 4130 * Only need to take the page struct lock on the large page root.
4142 4131 */
4143 4132 page_struct_lock(ppa[0]);
4144 4133 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4145 4134
4146 4135 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4147 4136 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4148 4137 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4149 4138 page_struct_unlock(ppa[0]);
4150 4139 return (0);
4151 4140 }
4152 4141 if (ppa[pg_idx]->p_lckcnt > 1)
4153 4142 lckpgs++;
4154 4143 }
4155 4144
4156 4145 if (lckpgs != 0) {
4157 4146 mutex_enter(&freemem_lock);
4158 4147 if (availrmem >= pages_pp_maximum + lckpgs) {
4159 4148 availrmem -= lckpgs;
4160 4149 pages_claimed += lckpgs;
4161 4150 } else {
4162 4151 mutex_exit(&freemem_lock);
4163 4152 page_struct_unlock(ppa[0]);
4164 4153 return (0);
4165 4154 }
4166 4155 mutex_exit(&freemem_lock);
4167 4156 }
4168 4157
4169 4158 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4170 4159 ppa[pg_idx]->p_lckcnt--;
4171 4160 ppa[pg_idx]->p_cowcnt++;
4172 4161 }
4173 4162 page_struct_unlock(ppa[0]);
4174 4163 return (1);
4175 4164 }
4176 4165
4177 4166 /*
4178 4167 * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4179 4168 * page.
4180 4169 */
4181 4170 int
4182 4171 page_subclaim_pages(page_t **ppa)
4183 4172 {
4184 4173 pgcnt_t ulckpgs = 0, pg_idx;
4185 4174
4186 4175 VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4187 4176
4188 4177 /*
4189 4178 * Only need to take the page struct lock on the large page root.
4190 4179 */
4191 4180 page_struct_lock(ppa[0]);
4192 4181 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4193 4182
4194 4183 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4195 4184 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4196 4185 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4197 4186 page_struct_unlock(ppa[0]);
4198 4187 return (0);
4199 4188 }
4200 4189 if (ppa[pg_idx]->p_lckcnt != 0)
4201 4190 ulckpgs++;
4202 4191 }
4203 4192
4204 4193 if (ulckpgs != 0) {
4205 4194 mutex_enter(&freemem_lock);
4206 4195 availrmem += ulckpgs;
4207 4196 pages_claimed -= ulckpgs;
4208 4197 mutex_exit(&freemem_lock);
4209 4198 }
4210 4199
4211 4200 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4212 4201 ppa[pg_idx]->p_cowcnt--;
4213 4202 ppa[pg_idx]->p_lckcnt++;
4214 4203
4215 4204 }
4216 4205 page_struct_unlock(ppa[0]);
4217 4206 return (1);
4218 4207 }
4219 4208
4220 4209 page_t *
4221 4210 page_numtopp(pfn_t pfnum, se_t se)
4222 4211 {
4223 4212 page_t *pp;
4224 4213
4225 4214 retry:
4226 4215 pp = page_numtopp_nolock(pfnum);
4227 4216 if (pp == NULL) {
4228 4217 return ((page_t *)NULL);
4229 4218 }
4230 4219
4231 4220 /*
4232 4221 * Acquire the appropriate lock on the page.
4233 4222 */
4234 4223 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4235 4224 if (page_pptonum(pp) != pfnum)
4236 4225 goto retry;
4237 4226 continue;
4238 4227 }
4239 4228
4240 4229 if (page_pptonum(pp) != pfnum) {
4241 4230 page_unlock(pp);
4242 4231 goto retry;
4243 4232 }
4244 4233
4245 4234 return (pp);
4246 4235 }
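
page_numtopp() is the lock-and-revalidate idiom: look the object up without any lock, lock it, then re-check the identity under the lock and retry from scratch if it changed, since the object can be recycled in the window. A compact sketch with hypothetical names (objs[], lookup()):

#include <pthread.h>
#include <stddef.h>

struct obj {
	pthread_mutex_t lock;	/* assume pthread_mutex_init() elsewhere */
	unsigned long key;	/* identity; may change while unlocked */
};

#define	NOBJS	16
static struct obj objs[NOBJS];

/* Unlocked lookup: the identity read here can go stale immediately. */
static struct obj *
lookup(unsigned long key)
{
	struct obj *op = &objs[key % NOBJS];

	return ((op->key == key) ? op : NULL);
}

static struct obj *
lookup_locked(unsigned long key)
{
	struct obj *op;

	for (;;) {
		if ((op = lookup(key)) == NULL)
			return (NULL);
		pthread_mutex_lock(&op->lock);
		if (op->key == key)
			return (op);	/* identity verified under lock */
		/* Recycled between lookup() and the lock: drop and retry. */
		pthread_mutex_unlock(&op->lock);
	}
}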
4247 4236
4248 4237 page_t *
4249 4238 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4250 4239 {
4251 4240 page_t *pp;
4252 4241
4253 4242 retry:
4254 4243 pp = page_numtopp_nolock(pfnum);
4255 4244 if (pp == NULL) {
4256 4245 return ((page_t *)NULL);
4257 4246 }
4258 4247
4259 4248 /*
4260 4249 * Acquire the appropriate lock on the page.
4261 4250 */
4262 4251 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4263 4252 if (page_pptonum(pp) != pfnum)
4264 4253 goto retry;
4265 4254 continue;
4266 4255 }
4267 4256
4268 4257 if (page_pptonum(pp) != pfnum) {
4269 4258 page_unlock(pp);
4270 4259 goto retry;
4271 4260 }
4272 4261
4273 4262 return (pp);
4274 4263 }
4275 4264
4276 4265 /*
4277 4266	 * This routine is like page_numtopp, but never waits. It only returns
4278 4267	 * page structs for pages which are ok for loading into hardware.
4279 4268 */
4280 4269 page_t *
4281 4270 page_numtopp_nowait(pfn_t pfnum, se_t se)
4282 4271 {
4283 4272 page_t *pp;
4284 4273
4285 4274 retry:
4286 4275 pp = page_numtopp_nolock(pfnum);
4287 4276 if (pp == NULL) {
4288 4277 return ((page_t *)NULL);
4289 4278 }
4290 4279
4291 4280 /*
4292 4281 * Try to acquire the appropriate lock on the page.
4293 4282 */
4294 4283 if (PP_ISFREE(pp))
4295 4284 pp = NULL;
4296 4285 else {
4297 4286 if (!page_trylock(pp, se))
4298 4287 pp = NULL;
4299 4288 else {
4300 4289 if (page_pptonum(pp) != pfnum) {
4301 4290 page_unlock(pp);
4302 4291 goto retry;
4303 4292 }
4304 4293 if (PP_ISFREE(pp)) {
4305 4294 page_unlock(pp);
4306 4295 pp = NULL;
4307 4296 }
4308 4297 }
4309 4298 }
4310 4299 return (pp);
4311 4300 }
4312 4301
4313 4302 #define SYNC_PROGRESS_NPAGES 1000
4314 4303
4315 4304 /*
4316 4305 * Returns a count of dirty pages that are in the process
4317 4306 * of being written out. If 'cleanit' is set, try to push the page.
4318 4307 */
4319 4308 pgcnt_t
4320 4309 page_busy(int cleanit)
4321 4310 {
4322 4311 page_t *page0 = page_first();
4323 4312 page_t *pp = page0;
4324 4313 pgcnt_t nppbusy = 0;
4325 4314 int counter = 0;
4326 4315 u_offset_t off;
4327 4316
4328 4317 do {
4329 4318 vnode_t *vp = pp->p_vnode;
4330 4319
4331 4320 /*
4332 4321 * Reset the sync timeout. The page list is very long
4333 4322 * on large memory systems.
4334 4323 */
4335 4324 if (++counter > SYNC_PROGRESS_NPAGES) {
4336 4325 counter = 0;
4337 4326 vfs_syncprogress();
4338 4327 }
4339 4328
4340 4329 /*
4341 4330 * A page is a candidate for syncing if it is:
4342 4331 *
4343 4332 * (a) On neither the freelist nor the cachelist
4344 4333 * (b) Hashed onto a vnode
4345 4334 * (c) Not a kernel page
4346 4335 * (d) Dirty
4347 4336 * (e) Not part of a swapfile
4348 4337 * (f) a page which belongs to a real vnode; eg has a non-null
4349 4338 * v_vfsp pointer.
4350 4339 * (g) Backed by a filesystem which doesn't have a
4351 4340 * stubbed-out sync operation
4352 4341 */
4353 4342 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4354 4343 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4355 4344 vfs_can_sync(vp->v_vfsp)) {
4356 4345 nppbusy++;
4357 4346
4358 4347 if (!cleanit)
4359 4348 continue;
4360 4349 if (!page_trylock(pp, SE_EXCL))
4361 4350 continue;
4362 4351
4363 4352 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4364 4353 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4365 4354 !(hat_pagesync(pp,
4366 4355 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4367 4356 page_unlock(pp);
4368 4357 continue;
4369 4358 }
4370 4359 off = pp->p_offset;
4371 4360 VN_HOLD(vp);
4372 4361 page_unlock(pp);
4373 4362 (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4374 4363 B_ASYNC | B_FREE, kcred, NULL);
4375 4364 VN_RELE(vp);
4376 4365 }
4377 4366 } while ((pp = page_next(pp)) != page0);
4378 4367
4379 4368 vfs_syncprogress();
4380 4369 return (nppbusy);
4381 4370 }
4382 4371
4383 4372 void page_invalidate_pages(void);
4384 4373
4385 4374 /*
4386 4375	 * Callback handler for the VM subsystem.
4387 4376	 *
4388 4377	 * Callers must ensure there are no recursive entries to this function.
4389 4378 */
4390 4379 /*ARGSUSED*/
4391 4380 boolean_t
4392 4381 callb_vm_cpr(void *arg, int code)
4393 4382 {
4394 4383 if (code == CB_CODE_CPR_CHKPT)
4395 4384 page_invalidate_pages();
4396 4385 return (B_TRUE);
4397 4386 }
4398 4387
4399 4388 /*
4400 4389 * Invalidate all pages of the system.
4401 4390	 * It shouldn't be called until all user page activity has stopped.
4402 4391 */
4403 4392 void
4404 4393 page_invalidate_pages()
4405 4394 {
4406 4395 page_t *pp;
4407 4396 page_t *page0;
4408 4397 pgcnt_t nbusypages;
4409 4398 int retry = 0;
4410 4399 const int MAXRETRIES = 4;
4411 4400 top:
4412 4401 /*
4413 4402 * Flush dirty pages and destroy the clean ones.
4414 4403 */
4415 4404 nbusypages = 0;
4416 4405
4417 4406 pp = page0 = page_first();
4418 4407 do {
4419 4408 struct vnode *vp;
4420 4409 u_offset_t offset;
4421 4410 int mod;
4422 4411
4423 4412		 * Skip the page if it has no vnode, or if it is associated
4424 4413		 * with the kernel vnode or with prom-allocated kernel memory.
4425 4414 * with the kernel vnode or prom allocated kernel mem.
4426 4415 */
4427 4416 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4428 4417 continue;
4429 4418
4430 4419 /*
4431 4420		 * Skip pages that have already been freed and invalidated.
4432 4421 */
4433 4422 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4434 4423 continue;
4435 4424
4436 4425 /*
4437 4426 * skip pages that are already locked or can't be "exclusively"
4438 4427 * locked or are already free. After we lock the page, check
4439 4428 * the free and age bits again to be sure it's not destroyed
4440 4429 * yet.
4441 4430 * To achieve max. parallelization, we use page_trylock instead
4442 4431		 * of page_lock so that we don't get blocked on individual pages
4443 4432 * while we have thousands of other pages to process.
4444 4433 */
4445 4434 if (!page_trylock(pp, SE_EXCL)) {
4446 4435 nbusypages++;
4447 4436 continue;
4448 4437 } else if (PP_ISFREE(pp)) {
4449 4438 if (!PP_ISAGED(pp)) {
4450 4439 page_destroy_free(pp);
4451 4440 } else {
4452 4441 page_unlock(pp);
4453 4442 }
4454 4443 continue;
4455 4444 }
4456 4445 /*
4457 4446 * Is this page involved in some I/O? shared?
4458 4447 *
4459 4448 * The page_struct_lock need not be acquired to
4460 4449 * examine these fields since the page has an
4461 4450 * "exclusive" lock.
4462 4451 */
4463 4452 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4464 4453 page_unlock(pp);
4465 4454 continue;
4466 4455 }
4467 4456
4468 4457 if (vp->v_type == VCHR) {
4469 4458 panic("vp->v_type == VCHR");
4470 4459 /*NOTREACHED*/
4471 4460 }
4472 4461
4473 4462 if (!page_try_demote_pages(pp)) {
4474 4463 page_unlock(pp);
4475 4464 continue;
4476 4465 }
4477 4466
4478 4467 /*
4479 4468 * Check the modified bit. Leave the bits alone in hardware
4480 4469 * (they will be modified if we do the putpage).
4481 4470 */
4482 4471 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4483 4472 & P_MOD);
4484 4473 if (mod) {
4485 4474 offset = pp->p_offset;
4486 4475 /*
4487 4476 * Hold the vnode before releasing the page lock
4488 4477 * to prevent it from being freed and re-used by
4489 4478 * some other thread.
4490 4479 */
4491 4480 VN_HOLD(vp);
4492 4481 page_unlock(pp);
4493 4482 /*
4494 4483 * No error return is checked here. Callers such as
4495 4484			 * cpr deal with the dirty pages at dump time
4496 4485 * if this putpage fails.
4497 4486 */
4498 4487 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4499 4488 kcred, NULL);
4500 4489 VN_RELE(vp);
4501 4490 } else {
4502 4491 /*LINTED: constant in conditional context*/
4503 4492 VN_DISPOSE(pp, B_INVAL, 0, kcred);
4504 4493 }
4505 4494 } while ((pp = page_next(pp)) != page0);
4506 4495 if (nbusypages && retry++ < MAXRETRIES) {
4507 4496 delay(1);
4508 4497 goto top;
4509 4498 }
4510 4499 }
4511 4500
4512 4501 /*
4513 4502 * Replace the page "old" with the page "new" on the page hash and vnode lists
4514 4503 *
4515 4504	 * The replacement must be done in place; i.e., the equivalent sequence:
4516 4505 *
4517 4506 * vp = old->p_vnode;
4518 4507 * off = old->p_offset;
4519 4508 * page_do_hashout(old)
4520 4509 * page_do_hashin(new, vp, off)
4521 4510 *
4522 4511 * doesn't work, since
4523 4512 * 1) if old is the only page on the vnode, the v_pages list has a window
4524 4513 * where it looks empty. This will break file system assumptions.
4525 4514 * and
4526 4515 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4527 4516 */
4528 4517 static void
4529 4518 page_do_relocate_hash(page_t *new, page_t *old)
4530 4519 {
4531 4520 page_t **hash_list;
4532 4521 vnode_t *vp = old->p_vnode;
4533 4522 kmutex_t *sep;
4534 4523
4535 4524 ASSERT(PAGE_EXCL(old));
4536 4525 ASSERT(PAGE_EXCL(new));
4537 4526 ASSERT(vp != NULL);
4538 4527 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4539 4528 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4540 4529
4541 4530 /*
4542 4531 * First find old page on the page hash list
4543 4532 */
4544 4533 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4545 4534
4546 4535 for (;;) {
4547 4536 if (*hash_list == old)
4548 4537 break;
4549 4538 if (*hash_list == NULL) {
4550 4539			panic("page_do_relocate_hash");
4551 4540 /*NOTREACHED*/
4552 4541 }
4553 4542 hash_list = &(*hash_list)->p_hash;
4554 4543 }
4555 4544
4556 4545 /*
4557 4546 * update new and replace old with new on the page hash list
4558 4547 */
4559 4548 new->p_vnode = old->p_vnode;
4560 4549 new->p_offset = old->p_offset;
4561 4550 new->p_hash = old->p_hash;
4562 4551 *hash_list = new;
4563 4552
4564 4553 if ((new->p_vnode->v_flag & VISSWAP) != 0)
4565 4554 PP_SETSWAP(new);
4566 4555
4567 4556 /*
4568 4557 * replace old with new on the vnode's page list
4569 4558 */
4570 4559 if (old->p_vpnext == old) {
4571 4560 new->p_vpnext = new;
4572 4561 new->p_vpprev = new;
4573 4562 } else {
4574 4563 new->p_vpnext = old->p_vpnext;
4575 4564 new->p_vpprev = old->p_vpprev;
4576 4565 new->p_vpnext->p_vpprev = new;
4577 4566 new->p_vpprev->p_vpnext = new;
4578 4567 }
4579 4568 if (vp->v_pages == old)
4580 4569 vp->v_pages = new;
4581 4570
4582 4571 /*
4583 4572 * clear out the old page
4584 4573 */
4585 4574 old->p_hash = NULL;
4586 4575 old->p_vpnext = NULL;
4587 4576 old->p_vpprev = NULL;
4588 4577 old->p_vnode = NULL;
4589 4578 PP_CLRSWAP(old);
4590 4579 old->p_offset = (u_offset_t)-1;
4591 4580 page_clr_all_props(old);
4592 4581
4593 4582 /*
4594 4583 * Wake up processes waiting for this page. The page's
4595 4584 * identity has been changed, and is probably not the
4596 4585 * desired page any longer.
4597 4586 */
4598 4587 sep = page_se_mutex(old);
4599 4588 mutex_enter(sep);
4600 4589 old->p_selock &= ~SE_EWANTED;
4601 4590 if (CV_HAS_WAITERS(&old->p_cv))
4602 4591 cv_broadcast(&old->p_cv);
4603 4592 mutex_exit(sep);
4604 4593 }
4605 4594
4606 4595 /*
4607 4596 * This function moves the identity of page "pp_old" to page "pp_new".
4608 4597 * Both pages must be locked on entry. "pp_new" is free, has no identity,
4609 4598 * and need not be hashed out from anywhere.
4610 4599 */
4611 4600 void
4612 4601 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4613 4602 {
4614 4603 vnode_t *vp = pp_old->p_vnode;
4615 4604 u_offset_t off = pp_old->p_offset;
4616 4605 kmutex_t *phm, *vphm;
4617 4606
4618 4607 /*
4619 4608 * Rehash two pages
4620 4609 */
4621 4610 ASSERT(PAGE_EXCL(pp_old));
4622 4611 ASSERT(PAGE_EXCL(pp_new));
4623 4612 ASSERT(vp != NULL);
4624 4613 ASSERT(pp_new->p_vnode == NULL);
4625 4614
4626 4615 /*
4627 4616 * hashout then hashin while holding the mutexes
4628 4617 */
4629 4618 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4630 4619 mutex_enter(phm);
4631 4620 vphm = page_vnode_mutex(vp);
4632 4621 mutex_enter(vphm);
4633 4622
4634 4623 page_do_relocate_hash(pp_new, pp_old);
4635 4624
4636 4625 /* The following comment preserved from page_flip(). */
4637 4626 pp_new->p_fsdata = pp_old->p_fsdata;
4638 4627 pp_old->p_fsdata = 0;
4639 4628 mutex_exit(vphm);
4640 4629 mutex_exit(phm);
4641 4630
4642 4631 /*
4643 4632 * The page_struct_lock need not be acquired for lckcnt and
4644 4633 * cowcnt since the page has an "exclusive" lock.
4645 4634 */
4646 4635 ASSERT(pp_new->p_lckcnt == 0);
4647 4636 ASSERT(pp_new->p_cowcnt == 0);
4648 4637 pp_new->p_lckcnt = pp_old->p_lckcnt;
4649 4638 pp_new->p_cowcnt = pp_old->p_cowcnt;
4650 4639 pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4651 4640
4652 4641 }
4653 4642
4654 4643 /*
4655 4644 * Helper routine used to lock all remaining members of a
4656 4645 * large page. The caller is responsible for passing in a locked
4657 4646 * pp. If pp is a large page, then it succeeds in locking all the
4658 4647 * remaining constituent pages or it returns with only the
4659 4648 * original page locked.
4660 4649 *
4661 4650 * Returns 1 on success, 0 on failure.
4662 4651 *
4663 4652 * If success is returned this routine guarantees p_szc for all constituent
4664 4653 * pages of a large page pp belongs to can't change. To achieve this we
4665 4654 * recheck szc of pp after locking all constituent pages and retry if szc
4666 4655 * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4667 4656 * lock on one of constituent pages it can't be running after all constituent
4668 4657 * pages are locked. hat_page_demote() with a lock on a constituent page
4669 4658 * outside of this large page (i.e. pp belonged to a larger large page) is
4670 4659 * already done with all constituent pages of pp since the root's p_szc is
4671 4660 * changed last. Therefore no need to synchronize with hat_page_demote() that
4672 4661 * locked a constituent page outside of pp's current large page.
4673 4662 */
4674 4663 #ifdef DEBUG
4675 4664 uint32_t gpg_trylock_mtbf = 0;
4676 4665 #endif
4677 4666
4678 4667 int
4679 4668 group_page_trylock(page_t *pp, se_t se)
4680 4669 {
4681 4670 page_t *tpp;
4682 4671 pgcnt_t npgs, i, j;
4683 4672 uint_t pszc = pp->p_szc;
4684 4673
4685 4674 #ifdef DEBUG
4686 4675 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4687 4676 return (0);
4688 4677 }
4689 4678 #endif
4690 4679
4691 4680 if (pp != PP_GROUPLEADER(pp, pszc)) {
4692 4681 return (0);
4693 4682 }
4694 4683
4695 4684 retry:
4696 4685 ASSERT(PAGE_LOCKED_SE(pp, se));
4697 4686 ASSERT(!PP_ISFREE(pp));
4698 4687 if (pszc == 0) {
4699 4688 return (1);
4700 4689 }
4701 4690 npgs = page_get_pagecnt(pszc);
4702 4691 tpp = pp + 1;
4703 4692 for (i = 1; i < npgs; i++, tpp++) {
4704 4693 if (!page_trylock(tpp, se)) {
4705 4694 tpp = pp + 1;
4706 4695 for (j = 1; j < i; j++, tpp++) {
4707 4696 page_unlock(tpp);
4708 4697 }
4709 4698 return (0);
4710 4699 }
4711 4700 }
4712 4701 if (pp->p_szc != pszc) {
4713 4702 ASSERT(pp->p_szc < pszc);
4714 4703 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4715 4704 !IS_SWAPFSVP(pp->p_vnode));
4716 4705 tpp = pp + 1;
4717 4706 for (i = 1; i < npgs; i++, tpp++) {
4718 4707 page_unlock(tpp);
4719 4708 }
4720 4709 pszc = pp->p_szc;
4721 4710 goto retry;
4722 4711 }
4723 4712 return (1);
4724 4713 }
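
The core of group_page_trylock() is an all-or-nothing trylock over an array: the first failure rolls back everything taken so far, and only after full success is p_szc re-checked for a racing demotion. The trylock/rollback core as a pthread sketch:

#include <pthread.h>

/* Try to take locks[0 .. n-1]; on any failure undo and return 0. */
static int
trylock_all(pthread_mutex_t *locks, int n)
{
	int i, j;

	for (i = 0; i < n; i++) {
		if (pthread_mutex_trylock(&locks[i]) != 0) {
			/* Roll back everything acquired so far. */
			for (j = 0; j < i; j++)
				pthread_mutex_unlock(&locks[j]);
			return (0);
		}
	}
	return (1);		/* caller re-checks its size code here */
}

page_try_demote_pages() later in this file uses the same shape, with the twist that the page passed in is already locked and must be skipped by both the lock loop and the rollback loop.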
4725 4714
4726 4715 void
4727 4716 group_page_unlock(page_t *pp)
4728 4717 {
4729 4718 page_t *tpp;
4730 4719 pgcnt_t npgs, i;
4731 4720
4732 4721 ASSERT(PAGE_LOCKED(pp));
4733 4722 ASSERT(!PP_ISFREE(pp));
4734 4723 ASSERT(pp == PP_PAGEROOT(pp));
4735 4724 npgs = page_get_pagecnt(pp->p_szc);
4736 4725 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4737 4726 page_unlock(tpp);
4738 4727 }
4739 4728 }
4740 4729
4741 4730 /*
4742 4731 * returns
4743 4732 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages
4744 4733 * ERANGE : this is not a base page
4745 4734 * EBUSY : failure to get locks on the page/pages
4746 4735 * ENOMEM : failure to obtain replacement pages
4747 4736 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel
4748 4737 * EIO : An error occurred while trying to copy the page data
4749 4738 *
4750 4739 * Return with all constituent members of target and replacement
4751 4740	 * SE_EXCL locked. It is the caller's responsibility to drop the
4752 4741 * locks.
4753 4742 */
4754 4743 int
4755 4744 do_page_relocate(
4756 4745 page_t **target,
4757 4746 page_t **replacement,
4758 4747 int grouplock,
4759 4748 spgcnt_t *nrelocp,
4760 4749 lgrp_t *lgrp)
4761 4750 {
4762 4751 page_t *first_repl;
4763 4752 page_t *repl;
4764 4753 page_t *targ;
4765 4754 page_t *pl = NULL;
4766 4755 uint_t ppattr;
4767 4756 pfn_t pfn, repl_pfn;
4768 4757 uint_t szc;
4769 4758 spgcnt_t npgs, i;
4770 4759 int repl_contig = 0;
4771 4760 uint_t flags = 0;
4772 4761 spgcnt_t dofree = 0;
4773 4762
4774 4763 *nrelocp = 0;
4775 4764
4776 4765 #if defined(__sparc)
4777 4766 /*
4778 4767 * We need to wait till OBP has completed
4779 4768 * its boot-time handoff of its resources to the kernel
4780 4769 * before we allow page relocation
4781 4770 */
4782 4771 if (page_relocate_ready == 0) {
4783 4772 return (EAGAIN);
4784 4773 }
4785 4774 #endif
4786 4775
4787 4776 /*
4788 4777 * If this is not a base page,
4789 4778 * just return with 0x0 pages relocated.
4790 4779 */
4791 4780 targ = *target;
4792 4781 ASSERT(PAGE_EXCL(targ));
4793 4782 ASSERT(!PP_ISFREE(targ));
4794 4783 szc = targ->p_szc;
4795 4784 ASSERT(szc < mmu_page_sizes);
4796 4785 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4797 4786 pfn = targ->p_pagenum;
4798 4787 if (pfn != PFN_BASE(pfn, szc)) {
4799 4788 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4800 4789 return (ERANGE);
4801 4790 }
4802 4791
4803 4792 if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4804 4793 repl_pfn = repl->p_pagenum;
4805 4794 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4806 4795 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4807 4796 return (ERANGE);
4808 4797 }
4809 4798 repl_contig = 1;
4810 4799 }
4811 4800
4812 4801 /*
4813 4802 * We must lock all members of this large page or we cannot
4814 4803 * relocate any part of it.
4815 4804 */
4816 4805 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4817 4806 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4818 4807 return (EBUSY);
4819 4808 }
4820 4809
4821 4810 /*
4822 4811	 * Reread szc; it could have been decreased before
4823 4812 * group_page_trylock() was done.
4824 4813 */
4825 4814 szc = targ->p_szc;
4826 4815 ASSERT(szc < mmu_page_sizes);
4827 4816 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4828 4817 ASSERT(pfn == PFN_BASE(pfn, szc));
4829 4818
4830 4819 npgs = page_get_pagecnt(targ->p_szc);
4831 4820
4832 4821 if (repl == NULL) {
4833 4822 dofree = npgs; /* Size of target page in MMU pages */
4834 4823 if (!page_create_wait(dofree, 0)) {
4835 4824 if (grouplock != 0) {
4836 4825 group_page_unlock(targ);
4837 4826 }
4838 4827 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4839 4828 return (ENOMEM);
4840 4829 }
4841 4830
4842 4831 /*
4843 4832 * seg kmem pages require that the target and replacement
4844 4833 * page be the same pagesize.
4845 4834 */
4846 4835 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4847 4836 repl = page_get_replacement_page(targ, lgrp, flags);
4848 4837 if (repl == NULL) {
4849 4838 if (grouplock != 0) {
4850 4839 group_page_unlock(targ);
4851 4840 }
4852 4841 page_create_putback(dofree);
4853 4842 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4854 4843 return (ENOMEM);
4855 4844 }
4856 4845 }
4857 4846 #ifdef DEBUG
4858 4847 else {
4859 4848 ASSERT(PAGE_LOCKED(repl));
4860 4849 }
4861 4850 #endif /* DEBUG */
4862 4851
4863 4852 #if defined(__sparc)
4864 4853 /*
4865 4854 * Let hat_page_relocate() complete the relocation if it's kernel page
4866 4855	 * Let hat_page_relocate() complete the relocation if it's a kernel page.
4867 4856 if (VN_ISKAS(targ->p_vnode)) {
4868 4857 *replacement = repl;
4869 4858 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4870 4859 if (grouplock != 0) {
4871 4860 group_page_unlock(targ);
4872 4861 }
4873 4862 if (dofree) {
4874 4863 *replacement = NULL;
4875 4864 page_free_replacement_page(repl);
4876 4865 page_create_putback(dofree);
4877 4866 }
4878 4867 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4879 4868 return (EAGAIN);
4880 4869 }
4881 4870 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4882 4871 return (0);
4883 4872 }
4884 4873 #else
4885 4874 #if defined(lint)
4886 4875 dofree = dofree;
4887 4876 #endif
4888 4877 #endif
4889 4878
4890 4879 first_repl = repl;
4891 4880
4892 4881 for (i = 0; i < npgs; i++) {
4893 4882 ASSERT(PAGE_EXCL(targ));
4894 4883 ASSERT(targ->p_slckcnt == 0);
4895 4884 ASSERT(repl->p_slckcnt == 0);
4896 4885
4897 4886 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4898 4887
4899 4888 ASSERT(hat_page_getshare(targ) == 0);
4900 4889 ASSERT(!PP_ISFREE(targ));
4901 4890 ASSERT(targ->p_pagenum == (pfn + i));
4902 4891 ASSERT(repl_contig == 0 ||
4903 4892 repl->p_pagenum == (repl_pfn + i));
4904 4893
4905 4894 /*
4906 4895 * Copy the page contents and attributes then
4907 4896 * relocate the page in the page hash.
4908 4897 */
4909 4898 if (ppcopy(targ, repl) == 0) {
4910 4899 targ = *target;
4911 4900 repl = first_repl;
4912 4901 VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4913 4902 if (grouplock != 0) {
4914 4903 group_page_unlock(targ);
4915 4904 }
4916 4905 if (dofree) {
4917 4906 *replacement = NULL;
4918 4907 page_free_replacement_page(repl);
4919 4908 page_create_putback(dofree);
4920 4909 }
4921 4910 return (EIO);
4922 4911 }
4923 4912
4924 4913 targ++;
4925 4914 if (repl_contig != 0) {
4926 4915 repl++;
4927 4916 } else {
4928 4917 repl = repl->p_next;
4929 4918 }
4930 4919 }
4931 4920
4932 4921 repl = first_repl;
4933 4922 targ = *target;
4934 4923
4935 4924 for (i = 0; i < npgs; i++) {
4936 4925 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4937 4926 page_clr_all_props(repl);
4938 4927 page_set_props(repl, ppattr);
4939 4928 page_relocate_hash(repl, targ);
4940 4929
4941 4930 ASSERT(hat_page_getshare(targ) == 0);
4942 4931 ASSERT(hat_page_getshare(repl) == 0);
4943 4932 /*
4944 4933 * Now clear the props on targ, after the
4945 4934 * page_relocate_hash(), they no longer
4946 4935 * have any meaning.
4947 4936 */
4948 4937 page_clr_all_props(targ);
4949 4938 ASSERT(targ->p_next == targ);
4950 4939 ASSERT(targ->p_prev == targ);
4951 4940 page_list_concat(&pl, &targ);
4952 4941
4953 4942 targ++;
4954 4943 if (repl_contig != 0) {
4955 4944 repl++;
4956 4945 } else {
4957 4946 repl = repl->p_next;
4958 4947 }
4959 4948 }
4960 4949 /* assert that we have come full circle with repl */
4961 4950 ASSERT(repl_contig == 1 || first_repl == repl);
4962 4951
4963 4952 *target = pl;
4964 4953 if (*replacement == NULL) {
4965 4954 ASSERT(first_repl == repl);
4966 4955 *replacement = repl;
4967 4956 }
4968 4957 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4969 4958 *nrelocp = npgs;
4970 4959 return (0);
4971 4960 }
4972 4961 /*
4973 4962 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4974 4963 */
4975 4964 int
4976 4965 page_relocate(
4977 4966 page_t **target,
4978 4967 page_t **replacement,
4979 4968 int grouplock,
4980 4969 int freetarget,
4981 4970 spgcnt_t *nrelocp,
4982 4971 lgrp_t *lgrp)
4983 4972 {
4984 4973 spgcnt_t ret;
4985 4974
4986 4975 /* do_page_relocate returns 0 on success or errno value */
4987 4976 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4988 4977
4989 4978 if (ret != 0 || freetarget == 0) {
4990 4979 return (ret);
4991 4980 }
4992 4981 if (*nrelocp == 1) {
4993 4982 ASSERT(*target != NULL);
4994 4983 page_free(*target, 1);
4995 4984 } else {
4996 4985 page_t *tpp = *target;
4997 4986 uint_t szc = tpp->p_szc;
4998 4987 pgcnt_t npgs = page_get_pagecnt(szc);
4999 4988 ASSERT(npgs > 1);
5000 4989 ASSERT(szc != 0);
5001 4990 do {
5002 4991 ASSERT(PAGE_EXCL(tpp));
5003 4992 ASSERT(!hat_page_is_mapped(tpp));
5004 4993 ASSERT(tpp->p_szc == szc);
5005 4994 PP_SETFREE(tpp);
5006 4995 PP_SETAGED(tpp);
5007 4996 npgs--;
5008 4997 } while ((tpp = tpp->p_next) != *target);
5009 4998 ASSERT(npgs == 0);
5010 4999 page_list_add_pages(*target, 0);
5011 5000 npgs = page_get_pagecnt(szc);
5012 5001 page_create_putback(npgs);
5013 5002 }
5014 5003 return (ret);
5015 5004 }
5016 5005
5017 5006 /*
5018 5007 * it is up to the caller to deal with pcf accounting.
5019 5008	 * It is up to the caller to deal with pcf accounting.
5020 5009 void
5021 5010 page_free_replacement_page(page_t *pplist)
5022 5011 {
5023 5012 page_t *pp;
5024 5013
5025 5014 while (pplist != NULL) {
5026 5015 /*
5027 5016 * pp_targ is a linked list.
5028 5017		 * pplist is a linked list of replacement pages.
5029 5018 pp = pplist;
5030 5019 if (pp->p_szc == 0) {
5031 5020 page_sub(&pplist, pp);
5032 5021 page_clr_all_props(pp);
5033 5022 PP_SETFREE(pp);
5034 5023 PP_SETAGED(pp);
5035 5024 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5036 5025 page_unlock(pp);
5037 5026 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5038 5027 } else {
5039 5028 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5040 5029 page_t *tpp;
5041 5030 page_list_break(&pp, &pplist, curnpgs);
5042 5031 tpp = pp;
5043 5032 do {
5044 5033 ASSERT(PAGE_EXCL(tpp));
5045 5034 ASSERT(!hat_page_is_mapped(tpp));
5046 5035 page_clr_all_props(tpp);
5047 5036 PP_SETFREE(tpp);
5048 5037 PP_SETAGED(tpp);
5049 5038 } while ((tpp = tpp->p_next) != pp);
5050 5039 page_list_add_pages(pp, 0);
5051 5040 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5052 5041 }
5053 5042 }
5054 5043 }
5055 5044
5056 5045 /*
5057 5046 * Relocate target to non-relocatable replacement page.
5058 5047 */
5059 5048 int
5060 5049 page_relocate_cage(page_t **target, page_t **replacement)
5061 5050 {
5062 5051 page_t *tpp, *rpp;
5063 5052 spgcnt_t pgcnt, npgs;
5064 5053 int result;
5065 5054
5066 5055 tpp = *target;
5067 5056
5068 5057 ASSERT(PAGE_EXCL(tpp));
5069 5058 ASSERT(tpp->p_szc == 0);
5070 5059
5071 5060 pgcnt = btop(page_get_pagesize(tpp->p_szc));
5072 5061
5073 5062 do {
5074 5063 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5075 5064 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5076 5065 if (rpp == NULL) {
5077 5066 page_create_putback(pgcnt);
5078 5067 kcage_cageout_wakeup();
5079 5068 }
5080 5069 } while (rpp == NULL);
5081 5070
5082 5071 ASSERT(PP_ISNORELOC(rpp));
5083 5072
5084 5073 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5085 5074
5086 5075 if (result == 0) {
5087 5076 *replacement = rpp;
5088 5077 if (pgcnt != npgs)
5089 5078 panic("page_relocate_cage: partial relocation");
5090 5079 }
5091 5080
5092 5081 return (result);
5093 5082 }
5094 5083
5095 5084 /*
5096 5085 * Release the page lock on a page, place on cachelist
5097 5086 * tail if no longer mapped. Caller can let us know if
5098 5087 * the page is known to be clean.
5099 5088 */
5100 5089 int
5101 5090 page_release(page_t *pp, int checkmod)
5102 5091 {
5103 5092 int status;
5104 5093
5105 5094 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5106 5095 (pp->p_vnode != NULL));
5107 5096
5108 5097 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5109 5098 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5110 5099 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5111 5100 !hat_page_is_mapped(pp)) {
5112 5101
5113 5102 /*
5114 5103 * If page is modified, unlock it
5115 5104 *
5116 5105 * (p_nrm & P_MOD) bit has the latest stuff because:
5117 5106 * (1) We found that this page doesn't have any mappings
5118 5107 * _after_ holding SE_EXCL and
5119 5108 * (2) We didn't drop SE_EXCL lock after the check in (1)
5120 5109 */
5121 5110 if (checkmod && hat_ismod(pp)) {
5122 5111 page_unlock(pp);
5123 5112 status = PGREL_MOD;
5124 5113 } else {
5125 5114 /*LINTED: constant in conditional context*/
5126 5115 VN_DISPOSE(pp, B_FREE, 0, kcred);
5127 5116 status = PGREL_CLEAN;
5128 5117 }
5129 5118 } else {
5130 5119 page_unlock(pp);
5131 5120 status = PGREL_NOTREL;
5132 5121 }
5133 5122 return (status);
5134 5123 }
5135 5124
5136 5125 /*
5137 5126 * Given a constituent page, try to demote the large page on the freelist.
5138 5127 *
5139 5128 * Returns nonzero if the page could be demoted successfully. Returns with
5140 5129 * the constituent page still locked.
5141 5130 */
5142 5131 int
5143 5132 page_try_demote_free_pages(page_t *pp)
5144 5133 {
5145 5134 page_t *rootpp = pp;
5146 5135 pfn_t pfn = page_pptonum(pp);
5147 5136 spgcnt_t npgs;
5148 5137 uint_t szc = pp->p_szc;
5149 5138
5150 5139 ASSERT(PP_ISFREE(pp));
5151 5140 ASSERT(PAGE_EXCL(pp));
5152 5141
5153 5142 /*
5154 5143 * Adjust rootpp and lock it, if `pp' is not the base
5155 5144 * constituent page.
5156 5145 */
5157 5146 npgs = page_get_pagecnt(pp->p_szc);
5158 5147 if (npgs == 1) {
5159 5148 return (0);
5160 5149 }
5161 5150
5162 5151 if (!IS_P2ALIGNED(pfn, npgs)) {
5163 5152 pfn = P2ALIGN(pfn, npgs);
5164 5153 rootpp = page_numtopp_nolock(pfn);
5165 5154 }
5166 5155
5167 5156 if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5168 5157 return (0);
5169 5158 }
5170 5159
5171 5160 if (rootpp->p_szc != szc) {
5172 5161 if (pp != rootpp)
5173 5162 page_unlock(rootpp);
5174 5163 return (0);
5175 5164 }
5176 5165
5177 5166 page_demote_free_pages(rootpp);
5178 5167
5179 5168 if (pp != rootpp)
5180 5169 page_unlock(rootpp);
5181 5170
5182 5171 ASSERT(PP_ISFREE(pp));
5183 5172 ASSERT(PAGE_EXCL(pp));
5184 5173 return (1);
5185 5174 }
5186 5175
5187 5176 /*
5188 5177 * Given a constituent page, try to demote the large page.
5189 5178 *
5190 5179 * Returns nonzero if the page could be demoted successfully. Returns with
5191 5180 * the constituent page still locked.
5192 5181 */
5193 5182 int
5194 5183 page_try_demote_pages(page_t *pp)
5195 5184 {
5196 5185 page_t *tpp, *rootpp = pp;
5197 5186 pfn_t pfn = page_pptonum(pp);
5198 5187 spgcnt_t i, npgs;
5199 5188 uint_t szc = pp->p_szc;
5200 5189 vnode_t *vp = pp->p_vnode;
5201 5190
5202 5191 ASSERT(PAGE_EXCL(pp));
5203 5192
5204 5193 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5205 5194
5206 5195 if (pp->p_szc == 0) {
5207 5196 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5208 5197 return (1);
5209 5198 }
5210 5199
5211 5200 if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5212 5201 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5213 5202 page_demote_vp_pages(pp);
5214 5203 ASSERT(pp->p_szc == 0);
5215 5204 return (1);
5216 5205 }
5217 5206
5218 5207 /*
5219 5208 * Adjust rootpp if passed in is not the base
5220 5209 * constituent page.
5221 5210 */
5222 5211 npgs = page_get_pagecnt(pp->p_szc);
5223 5212 ASSERT(npgs > 1);
5224 5213 if (!IS_P2ALIGNED(pfn, npgs)) {
5225 5214 pfn = P2ALIGN(pfn, npgs);
5226 5215 rootpp = page_numtopp_nolock(pfn);
5227 5216 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5228 5217 ASSERT(rootpp->p_vnode != NULL);
5229 5218 ASSERT(rootpp->p_szc == szc);
5230 5219 }
5231 5220
5232 5221 /*
5233 5222 * We can't demote kernel pages since we can't hat_unload()
5234 5223 * the mappings.
5235 5224 */
5236 5225 if (VN_ISKAS(rootpp->p_vnode))
5237 5226 return (0);
5238 5227
5239 5228 /*
5240 5229 * Attempt to lock all constituent pages except the page passed
5241 5230 * in since it's already locked.
5242 5231 */
5243 5232 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5244 5233 ASSERT(!PP_ISFREE(tpp));
5245 5234 ASSERT(tpp->p_vnode != NULL);
5246 5235
5247 5236 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5248 5237 break;
5249 5238 ASSERT(tpp->p_szc == rootpp->p_szc);
5250 5239 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5251 5240 }
5252 5241
5253 5242 /*
5254 5243 * If we failed to lock them all then unlock what we have
5255 5244 * locked so far and bail.
5256 5245 */
5257 5246 if (i < npgs) {
5258 5247 tpp = rootpp;
5259 5248 while (i-- > 0) {
5260 5249 if (tpp != pp)
5261 5250 page_unlock(tpp);
5262 5251 tpp++;
5263 5252 }
5264 5253 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5265 5254 return (0);
5266 5255 }
5267 5256
5268 5257 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5269 5258 ASSERT(PAGE_EXCL(tpp));
5270 5259 ASSERT(tpp->p_slckcnt == 0);
5271 5260 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5272 5261 tpp->p_szc = 0;
5273 5262 }
5274 5263
5275 5264 /*
5276 5265 * Unlock all pages except the page passed in.
5277 5266 */
5278 5267 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5279 5268 ASSERT(!hat_page_is_mapped(tpp));
5280 5269 if (tpp != pp)
5281 5270 page_unlock(tpp);
5282 5271 }
5283 5272
5284 5273 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5285 5274 return (1);
5286 5275 }
5287 5276
5288 5277 /*
5289 5278 * Called by page_free() and page_destroy() to demote the page size code
5290 5279  * (p_szc) to 0 (since we can't just put a single PAGESIZE page with nonzero
5291 5280  * p_szc on the free list, nor can we just clear the p_szc of a single page_t
5292 5281 * within a large page since it will break other code that relies on p_szc
5293 5282 * being the same for all page_t's of a large page). Anonymous pages should
5294 5283 * never end up here because anon_map_getpages() cannot deal with p_szc
5295 5284  * changes after a single constituent page is locked. While anonymous and
5296 5285  * kernel large pages are demoted or freed an entire large page at a time,
5297 5286  * with all constituent pages locked EXCL, for file system pages we have to
5298 5287  * be able to demote a large page (i.e. decrease the p_szc of all constituent
5299 5288  * pages) with just an EXCL lock on one of the constituent pages. The reason
5300 5289  * we can easily deal with anonymous page demotion an entire large page at a
5301 5290  * time is that those operations originate at the address space level and
5302 5291  * concern the entire large page region, with actual demotion done only when
5303 5292  * pages are not shared with any other processes (therefore we can always get
5304 5293  * an EXCL lock on all anonymous constituent pages after clearing the segment
5305 5294  * page cache). However, file system pages can be truncated or invalidated at
5306 5295  * a PAGESIZE level from the file system side and end up in page_free() or
5307 5296  * page_destroy() (we also allow only part of the large page to be SOFTLOCKed,
5308 5297  * so pageout should be able to demote a large page by EXCL locking
5309 5298  * any constituent page that is not under SOFTLOCK). In those cases we cannot
5310 5299  * rely on being able to lock EXCL all constituent pages.
5311 5300 *
5312 5301 * To prevent szc changes on file system pages one has to lock all constituent
5313 5302  * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5314 5303  * doesn't rely on locking all constituent pages (or on page_szc_lock()) to
5315 5304  * prevent szc changes is the hat layer, which uses its own page-level mlist
5316 5305  * locks. The hat assumes that szc doesn't change after the mlist lock for a
5317 5306  * page is taken. Therefore we need to change szc under hat-level locks if we
5318 5307  * only have an EXCL lock on a single constituent page and the hat still
5319 5308  * references any of the constituent pages. (Note that we can't "ignore" the
5320 5309  * hat layer by simply calling hat_pageunload() on all constituent pages
5321 5310  * without holding EXCL locks on all of them.) We use hat_page_demote() to
5322 5311  * safely demote the szc of all constituent pages under hat locks when we
5323 5312  * only have an EXCL lock on one of the constituent pages.
5324 5313 *
5325 5314  * This routine calls page_szc_lock() before calling hat_page_demote() to
5326 5315  * allow segvn in one special case not to lock all constituent pages SHARED
5327 5316  * before calling hat_memload_array(), which relies on p_szc not changing
5328 5317  * even before the hat-level mlist lock is taken. In that case segvn uses
5329 5318  * page_szc_lock() to prevent hat_page_demote() from changing p_szc values.
5330 5319 *
5331 5320  * Anonymous or kernel page demotion still has to lock all pages exclusively
5332 5321  * and do hat_pageunload() on all constituent pages before demoting the page;
5333 5322  * therefore there's no need for anonymous or kernel page demotion to use
5334 5323  * the hat_page_demote() mechanism.
5335 5324 *
5336 5325  * hat_page_demote() removes all large mappings that map pp and then decreases
5337 5326  * p_szc starting from the last constituent page of the large page. Working
5338 5327  * from the tail of a large page in decreasing pfn order lets anyone looking at
5339 5328  * the root page know that hat_page_demote() is done for the root's szc area:
5340 5329  * e.g. if a root page has szc 1, one only has to lock all constituent pages
5341 5330  * within the szc 1 area to prevent szc changes, because any hat_page_demote()
5342 5331  * that started on this page when it had szc > 1 is done for this szc 1 area.
5343 5332 *
5344 5333  * We are guaranteed that all constituent pages of pp's large page belong to
5345 5334  * the same vnode, with consecutive offsets increasing in the direction of
5346 5335  * the pfn, i.e. the identity of constituent pages can't change until their
5347 5336  * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5348 5337  * large mappings to pp even though we don't lock any constituent page except
5349 5338  * pp (i.e. we won't unload e.g. a kernel-locked page).
5350 5339 */
5351 5340 static void
5352 5341 page_demote_vp_pages(page_t *pp)
5353 5342 {
5354 5343 kmutex_t *mtx;
5355 5344
5356 5345 ASSERT(PAGE_EXCL(pp));
5357 5346 ASSERT(!PP_ISFREE(pp));
5358 5347 ASSERT(pp->p_vnode != NULL);
5359 5348 ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5360 5349 ASSERT(!PP_ISKAS(pp));
5361 5350
5362 5351 VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5363 5352
5364 5353 mtx = page_szc_lock(pp);
5365 5354 if (mtx != NULL) {
5366 5355 hat_page_demote(pp);
5367 5356 mutex_exit(mtx);
5368 5357 }
5369 5358 ASSERT(pp->p_szc == 0);
5370 5359 }
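/*
 * An illustrative reader-side sketch of the protocol described above (the
 * snippet is not part of this file): a consumer that needs a stable p_szc
 * for a file system page, without locking every constituent page SHARED,
 * can bracket its access with page_szc_lock():
 *
 *	kmutex_t *mtx = page_szc_lock(pp);
 *	uint_t szc = pp->p_szc;		-- stable while mtx (if any) is held
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 */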
5371 5360
5372 5361 /*
5373 5362 * Mark any existing pages for migration in the given range
5374 5363 */
5375 5364 void
5376 5365 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5377 5366 struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5378 5367 u_offset_t vnoff, int rflag)
5379 5368 {
5380 5369 struct anon *ap;
5381 5370 vnode_t *curvp;
5382 5371 lgrp_t *from;
5383 5372 pgcnt_t nlocked;
5384 5373 u_offset_t off;
5385 5374 pfn_t pfn;
5386 5375 size_t pgsz;
5387 5376 size_t segpgsz;
5388 5377 pgcnt_t pages;
5389 5378 uint_t pszc;
5390 5379 page_t *pp0, *pp;
5391 5380 caddr_t va;
5392 5381 ulong_t an_idx;
5393 5382 anon_sync_obj_t cookie;
5394 5383
5395 5384 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5396 5385
5397 5386 /*
5398 5387 	 * Don't do anything if we don't need to do lgroup optimizations
5399 5388 	 * on this system.
5400 5389 */
5401 5390 if (!lgrp_optimizations())
5402 5391 return;
5403 5392
5404 5393 /*
5405 5394 * Align address and length to (potentially large) page boundary
5406 5395 */
5407 5396 segpgsz = page_get_pagesize(seg->s_szc);
5408 5397 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5409 5398 if (rflag)
5410 5399 len = P2ROUNDUP(len, segpgsz);
5411 5400
5412 5401 /*
5413 5402 * Do one (large) page at a time
5414 5403 */
5415 5404 va = addr;
5416 5405 while (va < addr + len) {
5417 5406 /*
5418 5407 		 * Look up the (root) page for the vnode and offset
5419 5408 		 * corresponding to this virtual address.
5420 5409 		 * Try the anon map first since there may be copy-on-write
5421 5410 		 * pages, but initialize the vnode pointer and offset using
5422 5411 		 * the vnode arguments just in case there isn't an amp.
5423 5412 */
5424 5413 curvp = vp;
5425 5414 off = vnoff + va - seg->s_base;
5426 5415 if (amp) {
5427 5416 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5428 5417 an_idx = anon_index + seg_page(seg, va);
5429 5418 anon_array_enter(amp, an_idx, &cookie);
5430 5419 ap = anon_get_ptr(amp->ahp, an_idx);
5431 5420 if (ap)
5432 5421 swap_xlate(ap, &curvp, &off);
5433 5422 anon_array_exit(&cookie);
5434 5423 			ANON_LOCK_EXIT(&amp->a_rwlock);
5435 5424 }
5436 5425
5437 5426 pp = NULL;
5438 5427 if (curvp)
5439 5428 pp = page_lookup(curvp, off, SE_SHARED);
5440 5429
5441 5430 /*
5442 5431 * If there isn't a page at this virtual address,
5443 5432 * skip to next page
5444 5433 */
5445 5434 if (pp == NULL) {
5446 5435 va += PAGESIZE;
5447 5436 continue;
5448 5437 }
5449 5438
5450 5439 /*
5451 5440 * Figure out which lgroup this page is in for kstats
5452 5441 */
5453 5442 pfn = page_pptonum(pp);
5454 5443 from = lgrp_pfn_to_lgrp(pfn);
5455 5444
5456 5445 /*
5457 5446 * Get page size, and round up and skip to next page boundary
5458 5447 * if unaligned address
5459 5448 */
5460 5449 pszc = pp->p_szc;
5461 5450 pgsz = page_get_pagesize(pszc);
5462 5451 pages = btop(pgsz);
5463 5452 if (!IS_P2ALIGNED(va, pgsz) ||
5464 5453 !IS_P2ALIGNED(pfn, pages) ||
5465 5454 pgsz > segpgsz) {
5466 5455 pgsz = MIN(pgsz, segpgsz);
5467 5456 page_unlock(pp);
5468 5457 pages = btop(P2END((uintptr_t)va, pgsz) -
5469 5458 (uintptr_t)va);
5470 5459 va = (caddr_t)P2END((uintptr_t)va, pgsz);
5471 5460 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5472 5461 continue;
5473 5462 }
5474 5463
5475 5464 /*
5476 5465 * Upgrade to exclusive lock on page
5477 5466 */
5478 5467 if (!page_tryupgrade(pp)) {
5479 5468 page_unlock(pp);
5480 5469 va += pgsz;
5481 5470 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5482 5471 btop(pgsz));
5483 5472 continue;
5484 5473 }
5485 5474
5486 5475 pp0 = pp++;
5487 5476 nlocked = 1;
5488 5477
5489 5478 /*
5490 5479 		 * Lock the constituent pages if this is a large page
5491 5480 */
5492 5481 if (pages > 1) {
5493 5482 /*
5494 5483 * Lock all constituents except root page, since it
5495 5484 * should be locked already.
5496 5485 */
5497 5486 for (; nlocked < pages; nlocked++) {
5498 5487 if (!page_trylock(pp, SE_EXCL)) {
5499 5488 break;
5500 5489 }
5501 5490 if (PP_ISFREE(pp) ||
5502 5491 pp->p_szc != pszc) {
5503 5492 /*
5504 5493 * hat_page_demote() raced in with us.
5505 5494 */
5506 5495 ASSERT(!IS_SWAPFSVP(curvp));
5507 5496 page_unlock(pp);
5508 5497 break;
5509 5498 }
5510 5499 pp++;
5511 5500 }
5512 5501 }
5513 5502
5514 5503 /*
5515 5504 * If all constituent pages couldn't be locked,
5516 5505 * unlock pages locked so far and skip to next page.
5517 5506 */
5518 5507 if (nlocked < pages) {
5519 5508 while (pp0 < pp) {
5520 5509 page_unlock(pp0++);
5521 5510 }
5522 5511 va += pgsz;
5523 5512 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5524 5513 btop(pgsz));
5525 5514 continue;
5526 5515 }
5527 5516
5528 5517 /*
5529 5518 		 * hat_page_demote() can no longer happen
5530 5519 		 * since the last cons page had the right p_szc after
5531 5520 		 * all cons pages were locked. All cons pages
5532 5521 		 * should now have the same p_szc.
5533 5522 */
5534 5523
5535 5524 /*
5536 5525 * All constituent pages locked successfully, so mark
5537 5526 * large page for migration and unload the mappings of
5538 5527 * constituent pages, so a fault will occur on any part of the
5539 5528 * large page
5540 5529 */
5541 5530 PP_SETMIGRATE(pp0);
5542 5531 while (pp0 < pp) {
5543 5532 (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5544 5533 ASSERT(hat_page_getshare(pp0) == 0);
5545 5534 page_unlock(pp0++);
5546 5535 }
5547 5536 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5548 5537
5549 5538 va += pgsz;
5550 5539 }
5551 5540 }
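/*
 * How the two halves fit together (an illustrative sketch; the setup path
 * shown is hypothetical): a segment driver marks a range once, e.g. when
 * its placement policy is established:
 *
 *	page_mark_migrate(seg, addr, len, amp, anon_index, vp, vnoff, 1);
 *
 * Because the mappings of every marked page were unloaded above, the next
 * access faults, and the fault path can gather the constituent pages into
 * a ppa[] array and call page_migrate() below to perform the actual move.
 */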
5552 5541
5553 5542 /*
5554 5543 * Migrate any pages that have been marked for migration in the given range
5555 5544 */
5556 5545 void
5557 5546 page_migrate(
5558 5547 struct seg *seg,
5559 5548 caddr_t addr,
5560 5549 page_t **ppa,
5561 5550 pgcnt_t npages)
5562 5551 {
5563 5552 lgrp_t *from;
5564 5553 lgrp_t *to;
5565 5554 page_t *newpp;
5566 5555 page_t *pp;
5567 5556 pfn_t pfn;
5568 5557 size_t pgsz;
5569 5558 spgcnt_t page_cnt;
5570 5559 spgcnt_t i;
5571 5560 uint_t pszc;
5572 5561
5573 5562 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5574 5563
5575 5564 while (npages > 0) {
5576 5565 pp = *ppa;
5577 5566 pszc = pp->p_szc;
5578 5567 pgsz = page_get_pagesize(pszc);
5579 5568 page_cnt = btop(pgsz);
5580 5569
5581 5570 /*
5582 5571 * Check to see whether this page is marked for migration
5583 5572 *
5584 5573 		 * Assume that the root page of a large page is marked for
5585 5574 * migration and none of the other constituent pages
5586 5575 * are marked. This really simplifies clearing the
5587 5576 * migrate bit by not having to clear it from each
5588 5577 * constituent page.
5589 5578 *
5590 5579 		 * Note that we don't want to relocate an entire large page if
5591 5580 * someone is only using one subpage.
5592 5581 */
5593 5582 if (npages < page_cnt)
5594 5583 break;
5595 5584
5596 5585 /*
5597 5586 * Is it marked for migration?
5598 5587 */
5599 5588 if (!PP_ISMIGRATE(pp))
5600 5589 goto next;
5601 5590
5602 5591 /*
5603 5592 		 * Determine the lgroups that the page is being migrated between
5604 5593 */
5605 5594 pfn = page_pptonum(pp);
5606 5595 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5607 5596 break;
5608 5597 }
5609 5598 from = lgrp_pfn_to_lgrp(pfn);
5610 5599 to = lgrp_mem_choose(seg, addr, pgsz);
5611 5600
5612 5601 /*
5613 5602 		 * Need to get exclusive locks to migrate
5614 5603 */
5615 5604 for (i = 0; i < page_cnt; i++) {
5616 5605 ASSERT(PAGE_LOCKED(ppa[i]));
5617 5606 if (page_pptonum(ppa[i]) != pfn + i ||
5618 5607 ppa[i]->p_szc != pszc) {
5619 5608 break;
5620 5609 }
5621 5610 if (!page_tryupgrade(ppa[i])) {
5622 5611 lgrp_stat_add(from->lgrp_id,
5623 5612 LGRP_PM_FAIL_LOCK_PGS,
5624 5613 page_cnt);
5625 5614 break;
5626 5615 }
5627 5616
5628 5617 /*
5629 5618 			 * Check to see whether we are trying to migrate
5630 5619 			 * the page to the lgroup where it is already
5631 5620 			 * allocated. If so, clear the migrate bit and
5632 5621 			 * skip to the next page.
5633 5622 */
5634 5623 if (i == 0 && to == from) {
5635 5624 PP_CLRMIGRATE(ppa[0]);
5636 5625 page_downgrade(ppa[0]);
5637 5626 goto next;
5638 5627 }
5639 5628 }
5640 5629
5641 5630 /*
5642 5631 * If all constituent pages couldn't be locked,
5643 5632 * unlock pages locked so far and skip to next page.
5644 5633 */
5645 5634 if (i != page_cnt) {
5646 5635 while (--i != -1) {
5647 5636 page_downgrade(ppa[i]);
5648 5637 }
5649 5638 goto next;
5650 5639 }
5651 5640
5652 5641 (void) page_create_wait(page_cnt, PG_WAIT);
5653 5642 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5654 5643 if (newpp == NULL) {
5655 5644 page_create_putback(page_cnt);
5656 5645 for (i = 0; i < page_cnt; i++) {
5657 5646 page_downgrade(ppa[i]);
5658 5647 }
5659 5648 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5660 5649 page_cnt);
5661 5650 goto next;
5662 5651 }
5663 5652 ASSERT(newpp->p_szc == pszc);
5664 5653 /*
5665 5654 * Clear migrate bit and relocate page
5666 5655 */
5667 5656 PP_CLRMIGRATE(pp);
5668 5657 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5669 5658 panic("page_migrate: page_relocate failed");
5670 5659 }
5671 5660 ASSERT(page_cnt * PAGESIZE == pgsz);
5672 5661
5673 5662 /*
5674 5663 * Keep stats for number of pages migrated from and to
5675 5664 * each lgroup
5676 5665 */
5677 5666 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5678 5667 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5679 5668 /*
5680 5669 		 * Update the page_t array we were passed in and
5681 5670 * unlink constituent pages of a large page.
5682 5671 */
5683 5672 for (i = 0; i < page_cnt; ++i, ++pp) {
5684 5673 ASSERT(PAGE_EXCL(newpp));
5685 5674 ASSERT(newpp->p_szc == pszc);
5686 5675 ppa[i] = newpp;
5687 5676 pp = newpp;
5688 5677 page_sub(&newpp, pp);
5689 5678 page_downgrade(pp);
5690 5679 }
5691 5680 ASSERT(newpp == NULL);
5692 5681 next:
5693 5682 addr += pgsz;
5694 5683 ppa += page_cnt;
5695 5684 npages -= page_cnt;
5696 5685 }
5697 5686 }
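/*
 * Usage note, restating the contract visible above: the caller must hold
 * the address space lock and have every page in ppa[] at least SHARED
 * locked. On return, ppa[] refers to the replacement pages, each
 * downgraded back to a SHARED lock, so the caller can keep using the
 * array as if the pages had simply moved.
 */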
5698 5687
5699 5688 #define MAX_CNT 60 /* max num of iterations */
5700 5689 /*
5701 5690 * Reclaim/reserve availrmem for npages.
5702 5691  * If there is not enough memory, start reaping the seg and kmem caches.
5703 5692  * Start the pageout scanner (via page_needfree()).
5704 5693  * Exit after ~MAX_CNT seconds regardless of how much memory has been released.
5705 5694  * Note: There is no guarantee that any availrmem will be freed as
5706 5695  * this memory typically is locked (kernel heap) or reserved for swap.
5707 5696  * Also, due to memory fragmentation, the kmem allocator may not be able
5708 5697  * to free any memory (a single user-allocated buffer will prevent
5709 5698  * freeing a slab or a page).
5710 5699 */
5711 5700 int
5712 5701 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5713 5702 {
5714 5703 int i = 0;
5715 5704 int ret = 0;
5716 5705 pgcnt_t deficit;
5717 5706 pgcnt_t old_availrmem;
5718 5707
5719 5708 mutex_enter(&freemem_lock);
5720 5709 old_availrmem = availrmem - 1;
5721 5710 while ((availrmem < tune.t_minarmem + npages + epages) &&
5722 5711 (old_availrmem < availrmem) && (i++ < MAX_CNT)) {
5723 5712 old_availrmem = availrmem;
5724 5713 deficit = tune.t_minarmem + npages + epages - availrmem;
5725 5714 mutex_exit(&freemem_lock);
5726 5715 page_needfree(deficit);
5727 5716 kmem_reap();
5728 5717 delay(hz);
5729 5718 page_needfree(-(spgcnt_t)deficit);
5730 5719 mutex_enter(&freemem_lock);
5731 5720 }
5732 5721
5733 5722 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5734 5723 availrmem -= npages;
5735 5724 ret = 1;
5736 5725 }
5737 5726
5738 5727 mutex_exit(&freemem_lock);
5739 5728
5740 5729 return (ret);
5741 5730 }
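/*
 * An illustrative caller sketch (hypothetical, not from this file): a
 * client that wants to reserve npages, with epages as extra cushion,
 * could reserve and later release the availrmem like this:
 *
 *	if (!page_reclaim_mem(npages, epages, 1))
 *		return (ENOMEM);	-- reservation failed
 *	...
 *	mutex_enter(&freemem_lock);	-- undo the reservation
 *	availrmem += npages;
 *	mutex_exit(&freemem_lock);
 */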
5742 5731
5743 5732 /*
5744 5733 * Search the memory segments to locate the desired page. Within a
5745 5734 * segment, pages increase linearly with one page structure per
5746 5735 * physical page frame (size PAGESIZE). The search begins
5747 5736 * with the segment that was accessed last, to take advantage of locality.
5748 5737 * If the hint misses, we start from the beginning of the sorted memseg list
5749 5738  * If the hint misses, we start from the beginning of the sorted memseg list.
5750 5739
5751 5740
5752 5741 /*
5753 5742 * Some data structures for pfn to pp lookup.
5754 5743 */
5755 5744 ulong_t mhash_per_slot;
5756 5745 struct memseg *memseg_hash[N_MEM_SLOTS];
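/*
 * Each memseg_hash[] slot covers a fixed range of mhash_per_slot pfns
 * (set to (physmax + 1) >> MEM_HASH_SHIFT by build_pfn_hash() below);
 * a slot points at the lowest-based memseg overlapping its pfn range.
 * A lookup thus checks that memseg first and only falls back to a full
 * walk of the memseg list on a miss.
 */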
5757 5746
5758 5747 page_t *
5759 5748 page_numtopp_nolock(pfn_t pfnum)
5760 5749 {
5761 5750 struct memseg *seg;
5762 5751 page_t *pp;
5763 5752 vm_cpu_data_t *vc;
5764 5753
5765 5754 /*
5766 5755 * We need to disable kernel preemption while referencing the
5767 5756 * cpu_vm_data field in order to prevent us from being switched to
5768 5757 * another cpu and trying to reference it after it has been freed.
5769 5758 	 * This will keep us on the cpu and prevent it from being removed
5770 5759 	 * while we are still on it.
5771 5760 	 *
5772 5761 	 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5773 5762 	 * which is being reused by DR, which will flush those references
5774 5763 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5775 5764 */
5776 5765 kpreempt_disable();
5777 5766 vc = CPU->cpu_vm_data;
5778 5767 ASSERT(vc != NULL);
5779 5768
5780 5769 MEMSEG_STAT_INCR(nsearch);
5781 5770
5782 5771 /* Try last winner first */
5783 5772 if (((seg = vc->vc_pnum_memseg) != NULL) &&
5784 5773 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5785 5774 MEMSEG_STAT_INCR(nlastwon);
5786 5775 pp = seg->pages + (pfnum - seg->pages_base);
5787 5776 if (pp->p_pagenum == pfnum) {
5788 5777 kpreempt_enable();
5789 5778 return ((page_t *)pp);
5790 5779 }
5791 5780 }
5792 5781
5793 5782 /* Else Try hash */
5794 5783 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5795 5784 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5796 5785 MEMSEG_STAT_INCR(nhashwon);
5797 5786 vc->vc_pnum_memseg = seg;
5798 5787 pp = seg->pages + (pfnum - seg->pages_base);
5799 5788 if (pp->p_pagenum == pfnum) {
5800 5789 kpreempt_enable();
5801 5790 return ((page_t *)pp);
5802 5791 }
5803 5792 }
5804 5793
5805 5794 /* Else Brute force */
5806 5795 for (seg = memsegs; seg != NULL; seg = seg->next) {
5807 5796 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5808 5797 vc->vc_pnum_memseg = seg;
5809 5798 pp = seg->pages + (pfnum - seg->pages_base);
5810 5799 if (pp->p_pagenum == pfnum) {
5811 5800 kpreempt_enable();
5812 5801 return ((page_t *)pp);
5813 5802 }
5814 5803 }
5815 5804 }
5816 5805 vc->vc_pnum_memseg = NULL;
5817 5806 kpreempt_enable();
5818 5807 MEMSEG_STAT_INCR(nnotfound);
5819 5808 return ((page_t *)NULL);
5820 5809
5821 5810 }
5822 5811
5823 5812 struct memseg *
5824 5813 page_numtomemseg_nolock(pfn_t pfnum)
5825 5814 {
5826 5815 struct memseg *seg;
5827 5816 page_t *pp;
5828 5817
5829 5818 /*
5830 5819 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5831 5820 	 * which is being reused by DR, which will flush those references
5832 5821 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5833 5822 */
5834 5823 kpreempt_disable();
5835 5824 /* Try hash */
5836 5825 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5837 5826 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5838 5827 pp = seg->pages + (pfnum - seg->pages_base);
5839 5828 if (pp->p_pagenum == pfnum) {
5840 5829 kpreempt_enable();
5841 5830 return (seg);
5842 5831 }
5843 5832 }
5844 5833
5845 5834 /* Else Brute force */
5846 5835 for (seg = memsegs; seg != NULL; seg = seg->next) {
5847 5836 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5848 5837 pp = seg->pages + (pfnum - seg->pages_base);
5849 5838 if (pp->p_pagenum == pfnum) {
5850 5839 kpreempt_enable();
5851 5840 return (seg);
5852 5841 }
5853 5842 }
5854 5843 }
5855 5844 kpreempt_enable();
5856 5845 return ((struct memseg *)NULL);
5857 5846 }
5858 5847
5859 5848 /*
5860 5849  * Given a page and a count, return the page struct that is
5861 5850 * n structs away from the current one in the global page
5862 5851 * list.
5863 5852 *
5864 5853 * This function wraps to the first page upon
5865 5854 * reaching the end of the memseg list.
5866 5855 */
5867 5856 page_t *
5868 5857 page_nextn(page_t *pp, ulong_t n)
5869 5858 {
5870 5859 struct memseg *seg;
5871 5860 page_t *ppn;
5872 5861 vm_cpu_data_t *vc;
5873 5862
5874 5863 /*
5875 5864 * We need to disable kernel preemption while referencing the
5876 5865 * cpu_vm_data field in order to prevent us from being switched to
5877 5866 * another cpu and trying to reference it after it has been freed.
5878 5867 	 * This will keep us on the cpu and prevent it from being removed
5879 5868 	 * while we are still on it.
5880 5869 	 *
5881 5870 	 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5882 5871 	 * which is being reused by DR, which will flush those references
5883 5872 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5884 5873 */
5885 5874 kpreempt_disable();
5886 5875 vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5887 5876
5888 5877 ASSERT(vc != NULL);
5889 5878
5890 5879 if (((seg = vc->vc_pnext_memseg) == NULL) ||
5891 5880 (seg->pages_base == seg->pages_end) ||
5892 5881 !(pp >= seg->pages && pp < seg->epages)) {
5893 5882
5894 5883 for (seg = memsegs; seg; seg = seg->next) {
5895 5884 if (pp >= seg->pages && pp < seg->epages)
5896 5885 break;
5897 5886 }
5898 5887
5899 5888 if (seg == NULL) {
5900 5889 /* Memory delete got in, return something valid. */
5901 5890 /* TODO: fix me. */
5902 5891 seg = memsegs;
5903 5892 pp = seg->pages;
5904 5893 }
5905 5894 }
5906 5895
5907 5896 /* check for wraparound - possible if n is large */
5908 5897 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5909 5898 n -= seg->epages - pp;
5910 5899 seg = seg->next;
5911 5900 if (seg == NULL)
5912 5901 seg = memsegs;
5913 5902 pp = seg->pages;
5914 5903 }
5915 5904 vc->vc_pnext_memseg = seg;
5916 5905 kpreempt_enable();
5917 5906 return (ppn);
5918 5907 }
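/*
 * An illustrative iteration sketch (inspect() is a hypothetical per-page
 * callback): since page_nextn()/page_next() wrap back to the first page,
 * a full walk over all page_t's looks like:
 *
 *	page_t *first = page_first();
 *	page_t *pp = first;
 *	do {
 *		inspect(pp);
 *		pp = page_next(pp);
 *	} while (pp != first);
 */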
5919 5908
5920 5909 /*
5921 5910 * Initialize for a loop using page_next_scan_large().
5922 5911 */
5923 5912 page_t *
5924 5913 page_next_scan_init(void **cookie)
5925 5914 {
5926 5915 ASSERT(cookie != NULL);
5927 5916 *cookie = (void *)memsegs;
5928 5917 return ((page_t *)memsegs->pages);
5929 5918 }
5930 5919
5931 5920 /*
5932 5921 * Return the next page in a scan of page_t's, assuming we want
5933 5922 * to skip over sub-pages within larger page sizes.
5934 5923 *
5935 5924 * The cookie is used to keep track of the current memseg.
5936 5925 */
5937 5926 page_t *
5938 5927 page_next_scan_large(
5939 5928 page_t *pp,
5940 5929 ulong_t *n,
5941 5930 void **cookie)
5942 5931 {
5943 5932 struct memseg *seg = (struct memseg *)*cookie;
5944 5933 page_t *new_pp;
5945 5934 ulong_t cnt;
5946 5935 pfn_t pfn;
5947 5936
5948 5937
5949 5938 /*
5950 5939 * get the count of page_t's to skip based on the page size
5951 5940 */
5952 5941 ASSERT(pp != NULL);
5953 5942 if (pp->p_szc == 0) {
5954 5943 cnt = 1;
5955 5944 } else {
5956 5945 pfn = page_pptonum(pp);
5957 5946 cnt = page_get_pagecnt(pp->p_szc);
5958 5947 cnt -= pfn & (cnt - 1);
5959 5948 }
5960 5949 *n += cnt;
5961 5950 new_pp = pp + cnt;
5962 5951
5963 5952 /*
5964 5953 * Catch if we went past the end of the current memory segment. If so,
5965 5954 * just move to the next segment with pages.
5966 5955 */
5967 5956 if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5968 5957 do {
5969 5958 seg = seg->next;
5970 5959 if (seg == NULL)
5971 5960 seg = memsegs;
5972 5961 } while (seg->pages_base == seg->pages_end);
5973 5962 new_pp = seg->pages;
5974 5963 *cookie = (void *)seg;
5975 5964 }
5976 5965
5977 5966 return (new_pp);
5978 5967 }
5979 5968
5980 5969
5981 5970 /*
5982 5971 * Returns next page in list. Note: this function wraps
5983 5972 * to the first page in the list upon reaching the end
5984 5973 * of the list. Callers should be aware of this fact.
5985 5974 */
5986 5975
5987 5976 /* We should change this to be a #define */
5988 5977
5989 5978 page_t *
5990 5979 page_next(page_t *pp)
5991 5980 {
5992 5981 return (page_nextn(pp, 1));
5993 5982 }
5994 5983
5995 5984 page_t *
5996 5985 page_first()
5997 5986 {
5998 5987 return ((page_t *)memsegs->pages);
5999 5988 }
6000 5989
6001 5990
6002 5991 /*
6003 5992 * This routine is called at boot with the initial memory configuration
6004 5993 * and when memory is added or removed.
6005 5994 */
6006 5995 void
6007 5996 build_pfn_hash()
6008 5997 {
6009 5998 pfn_t cur;
6010 5999 pgcnt_t index;
6011 6000 struct memseg *pseg;
6012 6001 int i;
6013 6002
6014 6003 /*
6015 6004 * Clear memseg_hash array.
6016 6005 * Since memory add/delete is designed to operate concurrently
6017 6006 * with normal operation, the hash rebuild must be able to run
6018 6007 * concurrently with page_numtopp_nolock(). To support this
6019 6008 * functionality, assignments to memseg_hash array members must
6020 6009 * be done atomically.
6021 6010 *
6022 6011 * NOTE: bzero() does not currently guarantee this for kernel
6023 6012 * threads, and cannot be used here.
6024 6013 */
6025 6014 for (i = 0; i < N_MEM_SLOTS; i++)
6026 6015 memseg_hash[i] = NULL;
6027 6016
6028 6017 hat_kpm_mseghash_clear(N_MEM_SLOTS);
6029 6018
6030 6019 /*
6031 6020 * Physmax is the last valid pfn.
6032 6021 */
6033 6022 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6034 6023 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6035 6024 index = MEMSEG_PFN_HASH(pseg->pages_base);
6036 6025 cur = pseg->pages_base;
6037 6026 do {
6038 6027 if (index >= N_MEM_SLOTS)
6039 6028 index = MEMSEG_PFN_HASH(cur);
6040 6029
6041 6030 if (memseg_hash[index] == NULL ||
6042 6031 memseg_hash[index]->pages_base > pseg->pages_base) {
6043 6032 memseg_hash[index] = pseg;
6044 6033 hat_kpm_mseghash_update(index, pseg);
6045 6034 }
6046 6035 cur += mhash_per_slot;
6047 6036 index++;
6048 6037 } while (cur < pseg->pages_end);
6049 6038 }
6050 6039 }
6051 6040
6052 6041 /*
6053 6042 * Return the pagenum for the pp
6054 6043 */
6055 6044 pfn_t
6056 6045 page_pptonum(page_t *pp)
6057 6046 {
6058 6047 return (pp->p_pagenum);
6059 6048 }
6060 6049
6061 6050 /*
6062 6051  * Interface to the referenced, modified, etc. bits
6063 6052  * in the PSM part of the page struct
6064 6053  * when no locking is desired.
6065 6054 */
6066 6055 void
6067 6056 page_set_props(page_t *pp, uint_t flags)
6068 6057 {
6069 6058 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6070 6059 pp->p_nrm |= (uchar_t)flags;
6071 6060 }
6072 6061
6073 6062 void
6074 6063 page_clr_all_props(page_t *pp)
6075 6064 {
6076 6065 pp->p_nrm = 0;
6077 6066 }
6078 6067
6079 6068 /*
6080 6069 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6081 6070 */
6082 6071 int
6083 6072 page_clear_lck_cow(page_t *pp, int adjust)
6084 6073 {
6085 6074 int f_amount;
6086 6075
6087 6076 ASSERT(PAGE_EXCL(pp));
6088 6077
6089 6078 /*
6090 6079 * The page_struct_lock need not be acquired here since
6091 6080 	 * we require that the caller hold the page exclusively locked.
6092 6081 */
6093 6082 f_amount = 0;
6094 6083 if (pp->p_lckcnt) {
6095 6084 f_amount = 1;
6096 6085 pp->p_lckcnt = 0;
6097 6086 }
6098 6087 if (pp->p_cowcnt) {
6099 6088 f_amount += pp->p_cowcnt;
6100 6089 pp->p_cowcnt = 0;
6101 6090 }
6102 6091
6103 6092 if (adjust && f_amount) {
6104 6093 mutex_enter(&freemem_lock);
6105 6094 availrmem += f_amount;
6106 6095 mutex_exit(&freemem_lock);
6107 6096 }
6108 6097
6109 6098 return (f_amount);
6110 6099 }
6111 6100
6112 6101 /*
6113 6102  * The following function is called from free_vp_pages()
6114 6103 * for an inexact estimate of a newly free'd page...
6115 6104 */
6116 6105 ulong_t
6117 6106 page_share_cnt(page_t *pp)
6118 6107 {
6119 6108 return (hat_page_getshare(pp));
6120 6109 }
6121 6110
6122 6111 int
6123 6112 page_isshared(page_t *pp)
6124 6113 {
6125 6114 return (hat_page_checkshare(pp, 1));
6126 6115 }
6127 6116
6128 6117 int
6129 6118 page_isfree(page_t *pp)
6130 6119 {
6131 6120 return (PP_ISFREE(pp));
6132 6121 }
6133 6122
6134 6123 int
6135 6124 page_isref(page_t *pp)
6136 6125 {
6137 6126 return (hat_page_getattr(pp, P_REF));
6138 6127 }
6139 6128
6140 6129 int
6141 6130 page_ismod(page_t *pp)
6142 6131 {
6143 6132 return (hat_page_getattr(pp, P_MOD));
6144 6133 }
6145 6134
6146 6135 /*
6147 6136 * The following code all currently relates to the page capture logic:
6148 6137 *
6149 6138 * This logic is used for cases where there is a desire to claim a certain
6150 6139 * physical page in the system for the caller. As it may not be possible
6151 6140 * to capture the page immediately, the p_toxic bits are used in the page
6152 6141 * structure to indicate that someone wants to capture this page. When the
6153 6142 * page gets unlocked, the toxic flag will be noted and an attempt to capture
6154 6143 	 * the page will be made. If it is successful, the original caller's callback
6155 6144 * will be called with the page to do with it what they please.
6156 6145 *
6157 6146 	 * There is also an async thread which occasionally wakes up to attempt
6158 6147 	 * to capture pages which have the capture bit set. All of the pages which
6159 6148 * need to be captured asynchronously have been inserted into the
6160 6149 * page_capture_hash and thus this thread walks that hash list. Items in the
6161 6150 * hash have an expiration time so this thread handles that as well by removing
6162 6151 * the item from the hash if it has expired.
6163 6152 *
6164 6153 * Some important things to note are:
6165 6154 * - if the PR_CAPTURE bit is set on a page, then the page is in the
6166 6155 * page_capture_hash. The page_capture_hash_head.pchh_mutex is needed
6167 6156 * to set and clear this bit, and while the lock is held is the only time
6168 6157 * you can add or remove an entry from the hash.
6169 6158 * - the PR_CAPTURE bit can only be set and cleared while holding the
6170 6159 * page_capture_hash_head.pchh_mutex
6171 6160 * - the t_flag field of the thread struct is used with the T_CAPTURING
6172 6161 * flag to prevent recursion while dealing with large pages.
6173 6162 * - pages which need to be retired never expire on the page_capture_hash.
6174 6163 */
6175 6164 	 */
6176 6165 static void page_capture_thread(void);
6177 6166 static kthread_t *pc_thread_id;
6178 6167 kcondvar_t pc_cv;
6179 6168 static kmutex_t pc_thread_mutex;
6180 6169 static clock_t pc_thread_shortwait;
6181 6170 static clock_t pc_thread_longwait;
6182 6171 static int pc_thread_retry;
6183 6172
6184 6173 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6185 6174
6186 6175 /* Note that this is a circular linked list */
6187 6176 typedef struct page_capture_hash_bucket {
6188 6177 page_t *pp;
6189 6178 uchar_t szc;
6190 6179 uchar_t pri;
6191 6180 uint_t flags;
6192 6181 clock_t expires; /* lbolt at which this request expires. */
6193 6182 void *datap; /* Cached data passed in for callback */
6194 6183 struct page_capture_hash_bucket *next;
6195 6184 struct page_capture_hash_bucket *prev;
6196 6185 } page_capture_hash_bucket_t;
6197 6186
6198 6187 #define PC_PRI_HI 0 /* capture now */
6199 6188 #define PC_PRI_LO 1 /* capture later */
6200 6189 #define PC_NUM_PRI 2
6201 6190
6202 6191 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6203 6192
6204 6193
6205 6194 /*
6206 6195  * Each hash bucket will have its own mutex and two lists which are:
6207 6196 * active (0): represents requests which have not been processed by
6208 6197 * the page_capture async thread yet.
6209 6198 * walked (1): represents requests which have been processed by the
6210 6199  * page_capture async thread within its given walk of this bucket.
6211 6200 *
6212 6201 * These are all needed so that we can synchronize all async page_capture
6213 6202 * events. When the async thread moves to a new bucket, it will append the
6214 6203 * walked list to the active list and walk each item one at a time, moving it
6215 6204 * from the active list to the walked list. Thus if there is an async request
6216 6205 * outstanding for a given page, it will always be in one of the two lists.
6217 6206 * New requests will always be added to the active list.
6218 6207  * If we are not able to capture a page before the request expires, we free
6219 6208  * up the request structure, which indicates to page_capture that there is
6220 6209  * no longer a need for the given page, and clear the PR_CAPTURE flag if
6221 6210  * possible.
6222 6211 */
6223 6212 typedef struct page_capture_hash_head {
6224 6213 kmutex_t pchh_mutex;
6225 6214 uint_t num_pages[PC_NUM_PRI];
6226 6215 page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6227 6216 } page_capture_hash_head_t;
6228 6217
6229 6218 #ifdef DEBUG
6230 6219 #define NUM_PAGE_CAPTURE_BUCKETS 4
6231 6220 #else
6232 6221 #define NUM_PAGE_CAPTURE_BUCKETS 64
6233 6222 #endif
6234 6223
6235 6224 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6236 6225
6237 6226 /* for now use a very simple hash based upon the size of a page struct */
6238 6227 #define PAGE_CAPTURE_HASH(pp) \
6239 6228 ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
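/*
 * E.g. two page_t's that are adjacent within a memseg differ in address by
 * sizeof (page_t), which is on the order of 128 bytes (hence the shift by
 * 7), so consecutive pages tend to spread across different buckets.
 */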
6240 6229
6241 6230 extern pgcnt_t swapfs_minfree;
6242 6231
6243 6232 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6244 6233
6245 6234 /*
6246 6235  * A callback function is required for page capture requests.
6247 6236 */
6248 6237 void
6249 6238 page_capture_register_callback(uint_t index, clock_t duration,
6250 6239 int (*cb_func)(page_t *, void *, uint_t))
6251 6240 {
6252 6241 ASSERT(pc_cb[index].cb_active == 0);
6253 6242 ASSERT(cb_func != NULL);
6254 6243 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6255 6244 pc_cb[index].duration = duration;
6256 6245 pc_cb[index].cb_func = cb_func;
6257 6246 pc_cb[index].cb_active = 1;
6258 6247 rw_exit(&pc_cb[index].cb_rwlock);
6259 6248 }
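/*
 * An illustrative consumer flow (index and callback names are
 * hypothetical): a consumer registers once at init time and then captures
 * pages as the need arises:
 *
 *	page_capture_register_callback(MY_CB_INDEX, -1, my_cb);
 *		-- a duration of -1 means requests never expire
 *	...
 *	ret = page_trycapture(pp, 0, 1 << MY_CB_INDEX, my_datap);
 *
 * On success my_cb() is handed the captured page; on a transient failure
 * the request is queued on the page capture hash and retried
 * asynchronously.
 */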
6260 6249
6261 6250 void
6262 6251 page_capture_unregister_callback(uint_t index)
6263 6252 {
6264 6253 int i, j;
6265 6254 struct page_capture_hash_bucket *bp1;
6266 6255 struct page_capture_hash_bucket *bp2;
6267 6256 struct page_capture_hash_bucket *head = NULL;
6268 6257 uint_t flags = (1 << index);
6269 6258
6270 6259 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6271 6260 ASSERT(pc_cb[index].cb_active == 1);
6272 6261 pc_cb[index].duration = 0; /* Paranoia */
6273 6262 pc_cb[index].cb_func = NULL; /* Paranoia */
6274 6263 pc_cb[index].cb_active = 0;
6275 6264 rw_exit(&pc_cb[index].cb_rwlock);
6276 6265
6277 6266 /*
6278 6267 * Just move all the entries to a private list which we can walk
6279 6268 * through without the need to hold any locks.
6280 6269 * No more requests can get added to the hash lists for this consumer
6281 6270 * as the cb_active field for the callback has been cleared.
6282 6271 */
6283 6272 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6284 6273 mutex_enter(&page_capture_hash[i].pchh_mutex);
6285 6274 for (j = 0; j < 2; j++) {
6286 6275 bp1 = page_capture_hash[i].lists[j].next;
6287 6276 /* walk through all but first (sentinel) element */
6288 6277 while (bp1 != &page_capture_hash[i].lists[j]) {
6289 6278 bp2 = bp1;
6290 6279 if (bp2->flags & flags) {
6291 6280 bp1 = bp2->next;
6292 6281 bp1->prev = bp2->prev;
6293 6282 bp2->prev->next = bp1;
6294 6283 bp2->next = head;
6295 6284 head = bp2;
6296 6285 /*
6297 6286 * Clear the PR_CAPTURE bit as we
6298 6287 * hold appropriate locks here.
6299 6288 */
6300 6289 page_clrtoxic(head->pp, PR_CAPTURE);
6301 6290 page_capture_hash[i].
6302 6291 num_pages[bp2->pri]--;
6303 6292 continue;
6304 6293 }
6305 6294 bp1 = bp1->next;
6306 6295 }
6307 6296 }
6308 6297 mutex_exit(&page_capture_hash[i].pchh_mutex);
6309 6298 }
6310 6299
6311 6300 while (head != NULL) {
6312 6301 bp1 = head;
6313 6302 head = head->next;
6314 6303 kmem_free(bp1, sizeof (*bp1));
6315 6304 }
6316 6305 }
6317 6306
6318 6307
6319 6308 /*
6320 6309 * Find pp in the active list and move it to the walked list if it
6321 6310 * exists.
6322 6311  * Note that most often pp should be at the front of the active list,
6323 6312  * as it is currently in use, and thus no other sort of optimization
6324 6313  * is done here, since this is a linked-list data structure.
6325 6314 * Returns 1 on successful move or 0 if page could not be found.
6326 6315 */
6327 6316 static int
6328 6317 page_capture_move_to_walked(page_t *pp)
6329 6318 {
6330 6319 page_capture_hash_bucket_t *bp;
6331 6320 int index;
6332 6321
6333 6322 index = PAGE_CAPTURE_HASH(pp);
6334 6323
6335 6324 mutex_enter(&page_capture_hash[index].pchh_mutex);
6336 6325 bp = page_capture_hash[index].lists[0].next;
6337 6326 while (bp != &page_capture_hash[index].lists[0]) {
6338 6327 if (bp->pp == pp) {
6339 6328 /* Remove from old list */
6340 6329 bp->next->prev = bp->prev;
6341 6330 bp->prev->next = bp->next;
6342 6331
6343 6332 /* Add to new list */
6344 6333 bp->next = page_capture_hash[index].lists[1].next;
6345 6334 bp->prev = &page_capture_hash[index].lists[1];
6346 6335 page_capture_hash[index].lists[1].next = bp;
6347 6336 bp->next->prev = bp;
6348 6337
6349 6338 /*
6350 6339 			 * There is a small probability of a page on a free
6351 6340 			 * list being retired while being allocated,
6352 6341 			 * before P_RAF is set on it. The page may
6353 6342 			 * end up marked as a high priority request instead
6354 6343 			 * of a low priority request.
6355 6344 			 * If a P_RAF page is not marked as a low priority
6356 6345 			 * request, change it to a low priority request.
6357 6346 */
6358 6347 page_capture_hash[index].num_pages[bp->pri]--;
6359 6348 bp->pri = PAGE_CAPTURE_PRIO(pp);
6360 6349 page_capture_hash[index].num_pages[bp->pri]++;
6361 6350 mutex_exit(&page_capture_hash[index].pchh_mutex);
6362 6351 return (1);
6363 6352 }
6364 6353 bp = bp->next;
6365 6354 }
6366 6355 mutex_exit(&page_capture_hash[index].pchh_mutex);
6367 6356 return (0);
6368 6357 }
6369 6358
6370 6359 /*
6371 6360 * Add a new entry to the page capture hash. The only case where a new
6372 6361 * entry is not added is when the page capture consumer is no longer registered.
6373 6362 * In this case, we'll silently not add the page to the hash. We know that
6374 6363 * page retire will always be registered for the case where we are currently
6375 6364 * unretiring a page and thus there are no conflicts.
6376 6365 */
6377 6366 static void
6378 6367 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6379 6368 {
6380 6369 page_capture_hash_bucket_t *bp1;
6381 6370 page_capture_hash_bucket_t *bp2;
6382 6371 int index;
6383 6372 int cb_index;
6384 6373 int i;
6385 6374 uchar_t pri;
6386 6375 #ifdef DEBUG
6387 6376 page_capture_hash_bucket_t *tp1;
6388 6377 int l;
6389 6378 #endif
6390 6379
6391 6380 ASSERT(!(flags & CAPTURE_ASYNC));
6392 6381
6393 6382 bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6394 6383
6395 6384 bp1->pp = pp;
6396 6385 bp1->szc = szc;
6397 6386 bp1->flags = flags;
6398 6387 bp1->datap = datap;
6399 6388
6400 6389 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6401 6390 if ((flags >> cb_index) & 1) {
6402 6391 break;
6403 6392 }
6404 6393 }
6405 6394
6406 6395 ASSERT(cb_index != PC_NUM_CALLBACKS);
6407 6396
6408 6397 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6409 6398 if (pc_cb[cb_index].cb_active) {
6410 6399 if (pc_cb[cb_index].duration == -1) {
6411 6400 bp1->expires = (clock_t)-1;
6412 6401 } else {
6413 6402 bp1->expires = ddi_get_lbolt() +
6414 6403 pc_cb[cb_index].duration;
6415 6404 }
6416 6405 } else {
6417 6406 /* There's no callback registered so don't add to the hash */
6418 6407 rw_exit(&pc_cb[cb_index].cb_rwlock);
6419 6408 kmem_free(bp1, sizeof (*bp1));
6420 6409 return;
6421 6410 }
6422 6411
6423 6412 index = PAGE_CAPTURE_HASH(pp);
6424 6413
6425 6414 /*
6426 6415 	 * Only allow the capture flag to be modified under this mutex.
6427 6416 	 * This prevents multiple entries for the same page from being added.
6428 6417 */
6429 6418 mutex_enter(&page_capture_hash[index].pchh_mutex);
6430 6419
6431 6420 /*
6432 6421 * if not already on the hash, set capture bit and add to the hash
6433 6422 	 * If not already on the hash, set the capture bit and add to the hash.
6434 6423 if (!(pp->p_toxic & PR_CAPTURE)) {
6435 6424 #ifdef DEBUG
6436 6425 /* Check for duplicate entries */
6437 6426 for (l = 0; l < 2; l++) {
6438 6427 tp1 = page_capture_hash[index].lists[l].next;
6439 6428 while (tp1 != &page_capture_hash[index].lists[l]) {
6440 6429 if (tp1->pp == pp) {
6441 6430 panic("page pp 0x%p already on hash "
6442 6431 "at 0x%p\n",
6443 6432 (void *)pp, (void *)tp1);
6444 6433 }
6445 6434 tp1 = tp1->next;
6446 6435 }
6447 6436 }
6448 6437
6449 6438 #endif
6450 6439 page_settoxic(pp, PR_CAPTURE);
6451 6440 pri = PAGE_CAPTURE_PRIO(pp);
6452 6441 bp1->pri = pri;
6453 6442 bp1->next = page_capture_hash[index].lists[0].next;
6454 6443 bp1->prev = &page_capture_hash[index].lists[0];
6455 6444 bp1->next->prev = bp1;
6456 6445 page_capture_hash[index].lists[0].next = bp1;
6457 6446 page_capture_hash[index].num_pages[pri]++;
6458 6447 if (flags & CAPTURE_RETIRE) {
6459 6448 page_retire_incr_pend_count(datap);
6460 6449 }
6461 6450 mutex_exit(&page_capture_hash[index].pchh_mutex);
6462 6451 rw_exit(&pc_cb[cb_index].cb_rwlock);
6463 6452 cv_signal(&pc_cv);
6464 6453 return;
6465 6454 }
6466 6455
6467 6456 /*
6468 6457 * A page retire request will replace any other request.
6469 6458 * A second physmem request which is for a different process than
6470 6459 * the currently registered one will be dropped as there is
6471 6460 * no way to hold the private data for both calls.
6472 6461 * In the future, once there are more callers, this will have to
6473 6462 * be worked out better as there needs to be private storage for
6474 6463 	 * at least each type of caller (maybe have datap be an array of
6475 6464 	 * void *'s so that we can index based upon the caller's index).
6476 6465 */
6477 6466
6478 6467 /* walk hash list to update expire time */
6479 6468 for (i = 0; i < 2; i++) {
6480 6469 bp2 = page_capture_hash[index].lists[i].next;
6481 6470 while (bp2 != &page_capture_hash[index].lists[i]) {
6482 6471 if (bp2->pp == pp) {
6483 6472 if (flags & CAPTURE_RETIRE) {
6484 6473 if (!(bp2->flags & CAPTURE_RETIRE)) {
6485 6474 page_retire_incr_pend_count(
6486 6475 datap);
6487 6476 bp2->flags = flags;
6488 6477 bp2->expires = bp1->expires;
6489 6478 bp2->datap = datap;
6490 6479 }
6491 6480 } else {
6492 6481 ASSERT(flags & CAPTURE_PHYSMEM);
6493 6482 if (!(bp2->flags & CAPTURE_RETIRE) &&
6494 6483 (datap == bp2->datap)) {
6495 6484 bp2->expires = bp1->expires;
6496 6485 }
6497 6486 }
6498 6487 mutex_exit(&page_capture_hash[index].
6499 6488 pchh_mutex);
6500 6489 rw_exit(&pc_cb[cb_index].cb_rwlock);
6501 6490 kmem_free(bp1, sizeof (*bp1));
6502 6491 return;
6503 6492 }
6504 6493 bp2 = bp2->next;
6505 6494 }
6506 6495 }
6507 6496
6508 6497 /*
6509 6498 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6510 6499 * and thus it either has to be set or not set and can't change
6511 6500 * while holding the mutex above.
6512 6501 */
6513 6502 panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6514 6503 (void *)pp);
6515 6504 }
6516 6505
6517 6506 /*
6518 6507  * We have a page in our hands, so let's try to make it ours by turning
6519 6508 * it into a clean page like it had just come off the freelists.
6520 6509 *
6521 6510 * Returns 0 on success, with the page still EXCL locked.
6522 6511  * On failure, the page will be unlocked and EAGAIN will be returned.
6523 6512 */
6524 6513 static int
6525 6514 page_capture_clean_page(page_t *pp)
6526 6515 {
6527 6516 page_t *newpp;
6528 6517 int skip_unlock = 0;
6529 6518 spgcnt_t count;
6530 6519 page_t *tpp;
6531 6520 int ret = 0;
6532 6521 int extra;
6533 6522
6534 6523 ASSERT(PAGE_EXCL(pp));
6535 6524 ASSERT(!PP_RETIRED(pp));
6536 6525 ASSERT(curthread->t_flag & T_CAPTURING);
6537 6526
6538 6527 if (PP_ISFREE(pp)) {
6539 6528 if (!page_reclaim(pp, NULL)) {
6540 6529 skip_unlock = 1;
6541 6530 ret = EAGAIN;
6542 6531 goto cleanup;
6543 6532 }
6544 6533 ASSERT(pp->p_szc == 0);
6545 6534 if (pp->p_vnode != NULL) {
6546 6535 /*
6547 6536 * Since this page came from the
6548 6537 * cachelist, we must destroy the
6549 6538 * old vnode association.
6550 6539 */
6551 6540 page_hashout(pp, NULL);
6552 6541 }
6553 6542 goto cleanup;
6554 6543 }
6555 6544
6556 6545 /*
6557 6546 	 * If we know page_relocate() will fail, skip it.
6558 6547 	 * It could still fail due to a UE on another page but we
6559 6548 * can't do anything about that.
6560 6549 */
6561 6550 if (pp->p_toxic & PR_UE) {
6562 6551 goto skip_relocate;
6563 6552 }
6564 6553
6565 6554 /*
6566 6555 	 * It's possible for pages not to have a vnode, as fsflush comes
6567 6556 * through and cleans up these pages. It's ugly but that's how it is.
6568 6557 */
6569 6558 if (pp->p_vnode == NULL) {
6570 6559 goto skip_relocate;
6571 6560 }
6572 6561
6573 6562 /*
6574 6563 	 * The page was not free, so let's try to relocate it.
6575 6564 	 * page_relocate() only works with root pages, so if this is not a root
6576 6565 	 * page, we need to demote it to try to relocate it.
6577 6566 * Unfortunately this is the best we can do right now.
6578 6567 */
6579 6568 newpp = NULL;
6580 6569 if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6581 6570 if (page_try_demote_pages(pp) == 0) {
6582 6571 ret = EAGAIN;
6583 6572 goto cleanup;
6584 6573 }
6585 6574 }
6586 6575 ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6587 6576 if (ret == 0) {
6588 6577 page_t *npp;
6589 6578 /* unlock the new page(s) */
6590 6579 while (count-- > 0) {
6591 6580 ASSERT(newpp != NULL);
6592 6581 npp = newpp;
6593 6582 page_sub(&newpp, npp);
6594 6583 page_unlock(npp);
6595 6584 }
6596 6585 ASSERT(newpp == NULL);
6597 6586 /*
6598 6587 * Check to see if the page we have is too large.
6599 6588 		 * If so, demote it, freeing up the extra pages.
6600 6589 */
6601 6590 if (pp->p_szc > 0) {
6602 6591 /* For now demote extra pages to szc == 0 */
6603 6592 extra = page_get_pagecnt(pp->p_szc) - 1;
6604 6593 while (extra > 0) {
6605 6594 tpp = pp->p_next;
6606 6595 page_sub(&pp, tpp);
6607 6596 tpp->p_szc = 0;
6608 6597 page_free(tpp, 1);
6609 6598 extra--;
6610 6599 }
6611 6600 /* Make sure to set our page to szc 0 as well */
6612 6601 ASSERT(pp->p_next == pp && pp->p_prev == pp);
6613 6602 pp->p_szc = 0;
6614 6603 }
6615 6604 goto cleanup;
6616 6605 } else if (ret == EIO) {
6617 6606 ret = EAGAIN;
6618 6607 goto cleanup;
6619 6608 } else {
6620 6609 /*
6621 6610 		 * Need to reset the return value as we failed to relocate the page,
6622 6611 * but that does not mean that some of the next steps will not
6623 6612 * work.
6624 6613 */
6625 6614 ret = 0;
6626 6615 }
6627 6616
6628 6617 skip_relocate:
6629 6618
6630 6619 if (pp->p_szc > 0) {
6631 6620 if (page_try_demote_pages(pp) == 0) {
6632 6621 ret = EAGAIN;
6633 6622 goto cleanup;
6634 6623 }
6635 6624 }
6636 6625
6637 6626 ASSERT(pp->p_szc == 0);
6638 6627
6639 6628 if (hat_ismod(pp)) {
6640 6629 ret = EAGAIN;
6641 6630 goto cleanup;
6642 6631 }
6643 6632 if (PP_ISKAS(pp)) {
6644 6633 ret = EAGAIN;
6645 6634 goto cleanup;
6646 6635 }
6647 6636 if (pp->p_lckcnt || pp->p_cowcnt) {
6648 6637 ret = EAGAIN;
6649 6638 goto cleanup;
6650 6639 }
6651 6640
6652 6641 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6653 6642 ASSERT(!hat_page_is_mapped(pp));
6654 6643
6655 6644 if (hat_ismod(pp)) {
6656 6645 /*
6657 6646 * This is a semi-odd case as the page is now modified but not
6658 6647 * mapped as we just unloaded the mappings above.
6659 6648 */
6660 6649 ret = EAGAIN;
6661 6650 goto cleanup;
6662 6651 }
6663 6652 if (pp->p_vnode != NULL) {
6664 6653 page_hashout(pp, NULL);
6665 6654 }
6666 6655
6667 6656 /*
6668 6657 * At this point, the page should be in a clean state and
6669 6658 * we can do whatever we want with it.
6670 6659 */
6671 6660
6672 6661 cleanup:
6673 6662 if (ret != 0) {
6674 6663 if (!skip_unlock) {
6675 6664 page_unlock(pp);
6676 6665 }
6677 6666 } else {
6678 6667 ASSERT(pp->p_szc == 0);
6679 6668 ASSERT(PAGE_EXCL(pp));
6680 6669
6681 6670 pp->p_next = pp;
6682 6671 pp->p_prev = pp;
6683 6672 }
6684 6673 return (ret);
6685 6674 }
6686 6675
6687 6676 /*
6688 6677 * Various callers of page_trycapture() can have different restrictions upon
6689 6678 * what memory they have access to.
6690 6679 * Returns 0 on success, with the following error codes on failure:
6691 6680 * EPERM - The requested page is long term locked, and thus repeated
6692 6681 * requests to capture this page will likely fail.
6693 6682 * ENOMEM - There was not enough free memory in the system to safely
6694 6683 * map the requested page.
6695 6684 * ENOENT - The requested page was inside the kernel cage, and the
6696 6685 * PHYSMEM_CAGE flag was not set.
6697 6686 */
6698 6687 int
6699 6688 page_capture_pre_checks(page_t *pp, uint_t flags)
6700 6689 {
6701 6690 ASSERT(pp != NULL);
6702 6691
6703 6692 #if defined(__sparc)
6704 6693 if (pp->p_vnode == &promvp) {
6705 6694 return (EPERM);
6706 6695 }
6707 6696
6708 6697 if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6709 6698 (flags & CAPTURE_PHYSMEM)) {
6710 6699 return (ENOENT);
6711 6700 }
6712 6701
6713 6702 if (PP_ISNORELOCKERNEL(pp)) {
6714 6703 return (EPERM);
6715 6704 }
6716 6705 #else
6717 6706 if (PP_ISKAS(pp)) {
6718 6707 return (EPERM);
6719 6708 }
6720 6709 #endif /* __sparc */
6721 6710
6722 6711 /* only physmem currently has the restrictions checked below */
6723 6712 if (!(flags & CAPTURE_PHYSMEM)) {
6724 6713 return (0);
6725 6714 }
6726 6715
6727 6716 if (availrmem < swapfs_minfree) {
6728 6717 /*
6729 6718 * We won't try to capture this page as we are
6730 6719 * running low on memory.
6731 6720 */
6732 6721 return (ENOMEM);
6733 6722 }
6734 6723 return (0);
6735 6724 }
6736 6725
6737 6726 /*
6738 6727  * Once we have a page in our mitts, go ahead and complete the capture
6739 6728 * operation.
6740 6729  * Returns 1 on failure where the page is no longer needed
6741 6730 * Returns 0 on success
6742 6731 * Returns -1 if there was a transient failure.
6743 6732 * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6744 6733 */
6745 6734 int
6746 6735 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6747 6736 {
6748 6737 int cb_index;
6749 6738 int ret = 0;
6750 6739 page_capture_hash_bucket_t *bp1;
6751 6740 page_capture_hash_bucket_t *bp2;
6752 6741 int index;
6753 6742 int found = 0;
6754 6743 int i;
6755 6744
6756 6745 ASSERT(PAGE_EXCL(pp));
6757 6746 ASSERT(curthread->t_flag & T_CAPTURING);
6758 6747
6759 6748 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6760 6749 if ((flags >> cb_index) & 1) {
6761 6750 break;
6762 6751 }
6763 6752 }
6764 6753 ASSERT(cb_index < PC_NUM_CALLBACKS);
6765 6754
6766 6755 /*
6767 6756 * Remove the entry from the page_capture hash, but don't free it yet
6768 6757 * as we may need to put it back.
6769 6758 * Since we own the page at this point in time, we should find it
6770 6759 	 * in the hash if this is an ASYNC call. If we don't, it's likely
6771 6760 * that the page_capture_async() thread decided that this request
6772 6761 * had expired, in which case we just continue on.
6773 6762 */
6774 6763 if (flags & CAPTURE_ASYNC) {
6775 6764
6776 6765 index = PAGE_CAPTURE_HASH(pp);
6777 6766
6778 6767 mutex_enter(&page_capture_hash[index].pchh_mutex);
6779 6768 for (i = 0; i < 2 && !found; i++) {
6780 6769 bp1 = page_capture_hash[index].lists[i].next;
6781 6770 while (bp1 != &page_capture_hash[index].lists[i]) {
6782 6771 if (bp1->pp == pp) {
6783 6772 bp1->next->prev = bp1->prev;
6784 6773 bp1->prev->next = bp1->next;
6785 6774 page_capture_hash[index].
6786 6775 num_pages[bp1->pri]--;
6787 6776 page_clrtoxic(pp, PR_CAPTURE);
6788 6777 found = 1;
6789 6778 break;
6790 6779 }
6791 6780 bp1 = bp1->next;
6792 6781 }
6793 6782 }
6794 6783 mutex_exit(&page_capture_hash[index].pchh_mutex);
6795 6784 }
6796 6785
6797 6786 /* Synchronize with the unregister func. */
6798 6787 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6799 6788 if (!pc_cb[cb_index].cb_active) {
6800 6789 page_free(pp, 1);
6801 6790 rw_exit(&pc_cb[cb_index].cb_rwlock);
6802 6791 if (found) {
6803 6792 kmem_free(bp1, sizeof (*bp1));
6804 6793 }
6805 6794 return (1);
6806 6795 }
6807 6796
6808 6797 /*
6809 6798 * We need to remove the entry from the page capture hash and turn off
6810 6799 * the PR_CAPTURE bit before calling the callback. We'll need to cache
6811 6800 	 * the entry here, and then, based upon the return value, clean up
6812 6801 * appropriately or re-add it to the hash, making sure that someone else
6813 6802 * hasn't already done so.
6814 6803 * It should be rare for the callback to fail and thus it's ok for
6815 6804 * the failure path to be a bit complicated as the success path is
6816 6805 * cleaner and the locking rules are easier to follow.
6817 6806 */
6818 6807
6819 6808 ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6820 6809
6821 6810 rw_exit(&pc_cb[cb_index].cb_rwlock);
6822 6811
6823 6812 /*
6824 6813 * If this was an ASYNC request, we need to cleanup the hash if the
6825 6814 	 * If this was an ASYNC request, we need to clean up the hash if the
6826 6815 * For non-ASYNC requests, we return failure to map and the caller
6827 6816 * will take care of adding the request to the hash.
6828 6817 * Note also that the callback itself is responsible for the page
6829 6818 * at this point in time in terms of locking ... The most common
6830 6819 * case for the failure path should just be a page_free.
6831 6820 */
6832 6821 if (ret >= 0) {
6833 6822 if (found) {
6834 6823 if (bp1->flags & CAPTURE_RETIRE) {
6835 6824 page_retire_decr_pend_count(datap);
6836 6825 }
6837 6826 kmem_free(bp1, sizeof (*bp1));
6838 6827 }
6839 6828 return (ret);
6840 6829 }
6841 6830 if (!found) {
6842 6831 return (ret);
6843 6832 }
6844 6833
6845 6834 ASSERT(flags & CAPTURE_ASYNC);
6846 6835
6847 6836 /*
6848 6837 * Check for expiration time first as we can just free it up if it's
6849 6838 * expired.
6850 6839 */
6851 6840 if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6852 6841 kmem_free(bp1, sizeof (*bp1));
6853 6842 return (ret);
6854 6843 }
6855 6844
6856 6845 /*
6857 6846 * The callback failed and there used to be an entry in the hash for
6858 6847 * this page, so we need to add it back to the hash.
6859 6848 */
6860 6849 mutex_enter(&page_capture_hash[index].pchh_mutex);
6861 6850 if (!(pp->p_toxic & PR_CAPTURE)) {
6862 6851 /* just add bp1 back to head of walked list */
6863 6852 page_settoxic(pp, PR_CAPTURE);
6864 6853 bp1->next = page_capture_hash[index].lists[1].next;
6865 6854 bp1->prev = &page_capture_hash[index].lists[1];
6866 6855 bp1->next->prev = bp1;
6867 6856 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6868 6857 page_capture_hash[index].lists[1].next = bp1;
6869 6858 page_capture_hash[index].num_pages[bp1->pri]++;
6870 6859 mutex_exit(&page_capture_hash[index].pchh_mutex);
6871 6860 return (ret);
6872 6861 }
6873 6862
6874 6863 /*
6875 6864 	 * Otherwise there was a new capture request added to the list.
6876 6865 	 * We need to make sure that our original data is represented if
6877 6866 * appropriate.
6878 6867 */
6879 6868 for (i = 0; i < 2; i++) {
6880 6869 bp2 = page_capture_hash[index].lists[i].next;
6881 6870 while (bp2 != &page_capture_hash[index].lists[i]) {
6882 6871 if (bp2->pp == pp) {
6883 6872 if (bp1->flags & CAPTURE_RETIRE) {
6884 6873 if (!(bp2->flags & CAPTURE_RETIRE)) {
6885 6874 bp2->szc = bp1->szc;
6886 6875 bp2->flags = bp1->flags;
6887 6876 bp2->expires = bp1->expires;
6888 6877 bp2->datap = bp1->datap;
6889 6878 }
6890 6879 } else {
6891 6880 ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6892 6881 if (!(bp2->flags & CAPTURE_RETIRE)) {
6893 6882 bp2->szc = bp1->szc;
6894 6883 bp2->flags = bp1->flags;
6895 6884 bp2->expires = bp1->expires;
6896 6885 bp2->datap = bp1->datap;
6897 6886 }
6898 6887 }
6899 6888 page_capture_hash[index].num_pages[bp2->pri]--;
6900 6889 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6901 6890 page_capture_hash[index].num_pages[bp2->pri]++;
6902 6891 mutex_exit(&page_capture_hash[index].
6903 6892 pchh_mutex);
6904 6893 kmem_free(bp1, sizeof (*bp1));
6905 6894 return (ret);
6906 6895 }
6907 6896 bp2 = bp2->next;
6908 6897 }
6909 6898 }
6910 6899 panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6911 6900 /*NOTREACHED*/
6912 6901 }
6913 6902
6914 6903 /*
6915 6904 * Try to capture the given page for the caller specified in the flags
6916 6905 * parameter. The page will either be captured and handed over to the
6917 6906 * appropriate callback, or will be queued up in the page capture hash
6918 6907 * to be captured asynchronously.
6919 6908 * If the current request is due to an async capture, the page must be
6920 6909 * exclusively locked before calling this function.
6921 6910 * Currently szc must be 0 but in the future this should be expandable to
6922 6911 * other page sizes.
6923 6912 * Returns 0 on success, with the following error codes on failure:
6924 6913 * EPERM - The requested page is long term locked, and thus repeated
6925 6914 * requests to capture this page will likely fail.
6926 6915 * ENOMEM - There was not enough free memory in the system to safely
6927 6916 * map the requested page.
6928 6917 * ENOENT - The requested page was inside the kernel cage, and the
6929 6918 * CAPTURE_GET_CAGE flag was not set.
6930 6919	 * EAGAIN - The requested page could not be captured at this point in
6931 6920 * time but future requests will likely work.
6932 6921 * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6933 6922 * was not set.
6934 6923 */
6935 6924 int
6936 6925 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6937 6926 {
6938 6927 int ret;
6939 6928 int cb_index;
6940 6929
6941 6930 if (flags & CAPTURE_ASYNC) {
6942 6931 ASSERT(PAGE_EXCL(pp));
6943 6932 goto async;
6944 6933 }
6945 6934
6946 6935 /* Make sure there's enough availrmem ... */
6947 6936 ret = page_capture_pre_checks(pp, flags);
6948 6937 if (ret != 0) {
6949 6938 return (ret);
6950 6939 }
6951 6940
6952 6941 if (!page_trylock(pp, SE_EXCL)) {
6953 6942 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6954 6943 if ((flags >> cb_index) & 1) {
6955 6944 break;
6956 6945 }
6957 6946 }
6958 6947 ASSERT(cb_index < PC_NUM_CALLBACKS);
6959 6948 ret = EAGAIN;
6960 6949 /* Special case for retired pages */
6961 6950 if (PP_RETIRED(pp)) {
6962 6951 if (flags & CAPTURE_GET_RETIRED) {
6963 6952 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6964 6953 /*
6965 6954 * Need to set capture bit and add to
6966 6955 * hash so that the page will be
6967 6956 * retired when freed.
6968 6957 */
6969 6958 page_capture_add_hash(pp, szc,
6970 6959 CAPTURE_RETIRE, NULL);
6971 6960 ret = 0;
6972 6961 goto own_page;
6973 6962 }
6974 6963 } else {
6975 6964 return (EBUSY);
6976 6965 }
6977 6966 }
6978 6967 page_capture_add_hash(pp, szc, flags, datap);
6979 6968 return (ret);
6980 6969 }
6981 6970
6982 6971 async:
6983 6972 ASSERT(PAGE_EXCL(pp));
6984 6973
6985 6974	/* For physmem async requests, check that availrmem is sane */
6986 6975 if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6987 6976 (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6988 6977 (availrmem < swapfs_minfree)) {
6989 6978 page_unlock(pp);
6990 6979 return (ENOMEM);
6991 6980 }
6992 6981
6993 6982 ret = page_capture_clean_page(pp);
6994 6983
6995 6984 if (ret != 0) {
6996 6985		/* We failed to get the page, so let's add it to the hash */
6997 6986 if (!(flags & CAPTURE_ASYNC)) {
6998 6987 page_capture_add_hash(pp, szc, flags, datap);
6999 6988 }
7000 6989 return (ret);
7001 6990 }
7002 6991
7003 6992 own_page:
7004 6993 ASSERT(PAGE_EXCL(pp));
7005 6994 ASSERT(pp->p_szc == 0);
7006 6995
7007 6996 /* Call the callback */
7008 6997 ret = page_capture_take_action(pp, flags, datap);
7009 6998
7010 6999 if (ret == 0) {
7011 7000 return (0);
7012 7001 }
7013 7002
7014 7003 /*
7015 7004 * Note that in the failure cases from page_capture_take_action, the
7016 7005 * EXCL lock will have already been dropped.
7017 7006 */
7018 7007 if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7019 7008 page_capture_add_hash(pp, szc, flags, datap);
7020 7009 }
7021 7010 return (EAGAIN);
7022 7011 }
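
The return codes documented above fall into two groups: EAGAIN and ENOMEM are transient, while EPERM, EBUSY, and ENOENT indicate the page is unlikely to become capturable. A minimal userland sketch of a retry policy a caller might build on that split follows; try_capture() is a stand-in stub for illustration, not the kernel function:

	#include <errno.h>
	#include <stdio.h>

	/* Stand-in stub for illustration only; not page_trycapture(). */
	static int
	try_capture(void)
	{
		return (EAGAIN);
	}

	/*
	 * Retry only the failures documented as transient above; give
	 * up at once on the ones documented as likely permanent.
	 */
	static int
	capture_with_retry(int max_tries)
	{
		int i, ret = EAGAIN;

		for (i = 0; i < max_tries; i++) {
			ret = try_capture();
			switch (ret) {
			case 0:			/* captured */
				return (0);
			case EAGAIN:		/* transient; try again */
			case ENOMEM:
				continue;
			default:		/* EPERM, EBUSY, ENOENT */
				return (ret);
			}
		}
		return (ret);
	}

	int
	main(void)
	{
		printf("ret = %d\n", capture_with_retry(3));	/* EAGAIN */
		return (0);
	}
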
7023 7012
7024 7013 int
7025 7014 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7026 7015 {
7027 7016 int ret;
7028 7017
7029 7018 curthread->t_flag |= T_CAPTURING;
7030 7019 ret = page_itrycapture(pp, szc, flags, datap);
7031 7020	curthread->t_flag &= ~T_CAPTURING; /* we know it's set; clear it */
7032 7021 return (ret);
7033 7022 }
7034 7023
7035 7024 /*
7036 7025 * When unlocking a page which has the PR_CAPTURE bit set, this routine
7037 7026 * gets called to try and capture the page.
7038 7027 */
7039 7028 void
7040 7029 page_unlock_capture(page_t *pp)
7041 7030 {
7042 7031 page_capture_hash_bucket_t *bp;
7043 7032 int index;
7044 7033 int i;
7045 7034 uint_t szc;
7046 7035 uint_t flags = 0;
7047 7036 void *datap;
7048 7037 kmutex_t *mp;
7049 7038 extern vnode_t retired_pages;
7050 7039
7051 7040 /*
7052 7041	 * We need to protect against a possible deadlock here where we own
7053 7042	 * the vnode page hash mutex and want to acquire it again: there are
7054 7043	 * places in the code where we unlock a page while holding that
7055 7044	 * mutex, which can lead to the page being captured and eventually
7056 7045	 * ending up here. As we may be hashing out the old page and hashing
7057 7046	 * into the retire vnode, we need to make sure we own neither mutex.
7058 7047	 * Other callbacks that do hash operations likewise need to make
7059 7048	 * sure, before they hash in to a vnode, that they do not currently
7060 7049	 * own the vphm mutex; otherwise there will be a panic.
7061 7050 */
7062 7051 if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7063 7052 page_unlock_nocapture(pp);
7064 7053 return;
7065 7054 }
7066 7055 if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7067 7056 page_unlock_nocapture(pp);
7068 7057 return;
7069 7058 }
7070 7059
7071 7060 index = PAGE_CAPTURE_HASH(pp);
7072 7061
7073 7062 mp = &page_capture_hash[index].pchh_mutex;
7074 7063 mutex_enter(mp);
7075 7064 for (i = 0; i < 2; i++) {
7076 7065 bp = page_capture_hash[index].lists[i].next;
7077 7066 while (bp != &page_capture_hash[index].lists[i]) {
7078 7067 if (bp->pp == pp) {
7079 7068 szc = bp->szc;
7080 7069 flags = bp->flags | CAPTURE_ASYNC;
7081 7070 datap = bp->datap;
7082 7071 mutex_exit(mp);
7083 7072 (void) page_trycapture(pp, szc, flags, datap);
7084 7073 return;
7085 7074 }
7086 7075 bp = bp->next;
7087 7076 }
7088 7077 }
7089 7078
7090 7079 /* Failed to find page in hash so clear flags and unlock it. */
7091 7080 page_clrtoxic(pp, PR_CAPTURE);
7092 7081 page_unlock(pp);
7093 7082
7094 7083 mutex_exit(mp);
7095 7084 }
7096 7085
7097 7086 void
7098 7087 page_capture_init()
7099 7088 {
7100 7089 int i;
7101 7090 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7102 7091 page_capture_hash[i].lists[0].next =
7103 7092 &page_capture_hash[i].lists[0];
7104 7093 page_capture_hash[i].lists[0].prev =
7105 7094 &page_capture_hash[i].lists[0];
7106 7095 page_capture_hash[i].lists[1].next =
7107 7096 &page_capture_hash[i].lists[1];
7108 7097 page_capture_hash[i].lists[1].prev =
7109 7098 &page_capture_hash[i].lists[1];
7110 7099 }
7111 7100
7112 7101 pc_thread_shortwait = 23 * hz;
7113 7102 pc_thread_longwait = 1201 * hz;
7114 7103 pc_thread_retry = 3;
7115 7104 mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7116 7105 cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7117 7106 pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7118 7107 TS_RUN, minclsyspri);
7119 7108 }
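
page_capture_init() makes each bucket's lists[0] and lists[1] point at themselves, the standard sentinel-node idiom for an empty circular doubly linked list: the list is empty exactly when next points back at the sentinel, and no NULL checks are needed on insert or remove. A minimal self-contained sketch of the same idiom (node_t and the function names here are illustrative, not from this file):

	#include <stdio.h>

	typedef struct node {
		struct node *next;
		struct node *prev;
		int val;
	} node_t;

	/* An empty sentinel list points at itself, like lists[0]/[1]. */
	static void
	list_init(node_t *sent)
	{
		sent->next = sent;
		sent->prev = sent;
	}

	/* Head insert; cf. the bp1 re-add in page_capture_take_action(). */
	static void
	list_insert_head(node_t *sent, node_t *n)
	{
		n->next = sent->next;
		n->prev = sent;
		n->next->prev = n;
		sent->next = n;
	}

	int
	main(void)
	{
		node_t sent, a = { .val = 1 };

		list_init(&sent);
		printf("empty? %d\n", sent.next == &sent);	/* 1 */
		list_insert_head(&sent, &a);
		printf("head val: %d\n", sent.next->val);	/* 1 */
		printf("empty? %d\n", sent.next == &sent);	/* 0 */
		return (0);
	}
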
7120 7109
7121 7110 /*
7122 7111 * It is necessary to scrub any failing pages prior to reboot in order to
7123 7112 * prevent a latent error trap from occurring on the next boot.
7124 7113 */
7125 7114 void
7126 7115 page_retire_mdboot()
7127 7116 {
7128 7117 page_t *pp;
7129 7118 int i, j;
7130 7119 page_capture_hash_bucket_t *bp;
7131 7120 uchar_t pri;
7132 7121
7133 7122 /* walk lists looking for pages to scrub */
7134 7123 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7135 7124 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7136 7125 if (page_capture_hash[i].num_pages[pri] != 0) {
7137 7126 break;
7138 7127 }
7139 7128 }
7140 7129 if (pri == PC_NUM_PRI)
7141 7130 continue;
7142 7131
7143 7132 mutex_enter(&page_capture_hash[i].pchh_mutex);
7144 7133
7145 7134 for (j = 0; j < 2; j++) {
7146 7135 bp = page_capture_hash[i].lists[j].next;
7147 7136 while (bp != &page_capture_hash[i].lists[j]) {
7148 7137 pp = bp->pp;
7149 7138 if (PP_TOXIC(pp)) {
7150 7139 if (page_trylock(pp, SE_EXCL)) {
7151 7140 PP_CLRFREE(pp);
7152 7141 pagescrub(pp, 0, PAGESIZE);
7153 7142 page_unlock(pp);
7154 7143 }
7155 7144 }
7156 7145 bp = bp->next;
7157 7146 }
7158 7147 }
7159 7148 mutex_exit(&page_capture_hash[i].pchh_mutex);
7160 7149 }
7161 7150 }
7162 7151
7163 7152 /*
7164 7153 * Walk the page_capture_hash trying to capture pages and also cleanup old
7165 7154 * entries which have expired.
7166 7155 */
7167 7156 void
7168 7157 page_capture_async()
7169 7158 {
7170 7159 page_t *pp;
7171 7160 int i;
7172 7161 int ret;
7173 7162 page_capture_hash_bucket_t *bp1, *bp2;
7174 7163 uint_t szc;
7175 7164 uint_t flags;
7176 7165 void *datap;
7177 7166 uchar_t pri;
7178 7167
7179 7168 /* If there are outstanding pages to be captured, get to work */
7180 7169 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7181 7170 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7182 7171 if (page_capture_hash[i].num_pages[pri] != 0)
7183 7172 break;
7184 7173 }
7185 7174 if (pri == PC_NUM_PRI)
7186 7175 continue;
7187 7176
7188 7177 /* Append list 1 to list 0 and then walk through list 0 */
7189 7178 mutex_enter(&page_capture_hash[i].pchh_mutex);
7190 7179 bp1 = &page_capture_hash[i].lists[1];
7191 7180 bp2 = bp1->next;
7192 7181 if (bp1 != bp2) {
7193 7182 bp1->prev->next = page_capture_hash[i].lists[0].next;
7194 7183 bp2->prev = &page_capture_hash[i].lists[0];
7195 7184 page_capture_hash[i].lists[0].next->prev = bp1->prev;
7196 7185 page_capture_hash[i].lists[0].next = bp2;
7197 7186 bp1->next = bp1;
7198 7187 bp1->prev = bp1;
7199 7188 }
7200 7189
7201 7189		/* lists[1] will be empty now */
7202 7191
7203 7192 bp1 = page_capture_hash[i].lists[0].next;
7204 7193 while (bp1 != &page_capture_hash[i].lists[0]) {
7205 7194 /* Check expiration time */
7206 7195 if ((ddi_get_lbolt() > bp1->expires &&
7207 7196 bp1->expires != -1) ||
7208 7197 page_deleted(bp1->pp)) {
7209 7198 page_capture_hash[i].lists[0].next = bp1->next;
7210 7199 bp1->next->prev =
7211 7200 &page_capture_hash[i].lists[0];
7212 7201 page_capture_hash[i].num_pages[bp1->pri]--;
7213 7202
7214 7203 /*
7215 7204 * We can safely remove the PR_CAPTURE bit
7216 7205 * without holding the EXCL lock on the page
7217 7206				 * as the PR_CAPTURE bit requires that the
7218 7207 * page_capture_hash[].pchh_mutex be held
7219 7208 * to modify it.
7220 7209 */
7221 7210 page_clrtoxic(bp1->pp, PR_CAPTURE);
7222 7211 mutex_exit(&page_capture_hash[i].pchh_mutex);
7223 7212 kmem_free(bp1, sizeof (*bp1));
7224 7213 mutex_enter(&page_capture_hash[i].pchh_mutex);
7225 7214 bp1 = page_capture_hash[i].lists[0].next;
7226 7215 continue;
7227 7216 }
7228 7217 pp = bp1->pp;
7229 7218 szc = bp1->szc;
7230 7219 flags = bp1->flags;
7231 7220 datap = bp1->datap;
7232 7221 mutex_exit(&page_capture_hash[i].pchh_mutex);
7233 7222 if (page_trylock(pp, SE_EXCL)) {
7234 7223 ret = page_trycapture(pp, szc,
7235 7224 flags | CAPTURE_ASYNC, datap);
7236 7225 } else {
7237 7226 ret = 1; /* move to walked hash */
7238 7227 }
7239 7228
7240 7229 if (ret != 0) {
7241 7230 /* Move to walked hash */
7242 7231 (void) page_capture_move_to_walked(pp);
7243 7232 }
7244 7233 mutex_enter(&page_capture_hash[i].pchh_mutex);
7245 7234 bp1 = page_capture_hash[i].lists[0].next;
7246 7235 }
7247 7236
7248 7237 mutex_exit(&page_capture_hash[i].pchh_mutex);
7249 7238 }
7250 7239 }
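
The pointer surgery at the top of the loop above moves every entry of lists[1] onto the front of lists[0] in constant time, without visiting the entries. A self-contained sketch of that splice under the same sentinel-list representation (names are illustrative):

	#include <stdio.h>

	typedef struct node {
		struct node *next;
		struct node *prev;
		int val;
	} node_t;

	/*
	 * Move every element of 'src' onto the front of 'dst' in O(1)
	 * and leave 'src' empty -- the same pointer surgery the loop
	 * above does to lists[1] and lists[0].
	 */
	static void
	list_splice_head(node_t *dst, node_t *src)
	{
		node_t *first = src->next;

		if (first == src)		/* src is already empty */
			return;
		src->prev->next = dst->next;	/* src tail -> old dst head */
		dst->next->prev = src->prev;
		first->prev = dst;
		dst->next = first;
		src->next = src;		/* reset src to empty */
		src->prev = src;
	}

	int
	main(void)
	{
		node_t dst, src, a = { .val = 7 };

		dst.next = dst.prev = &dst;
		src.next = src.prev = &src;
		a.next = a.prev = &src;		/* src holds the node 'a' */
		src.next = src.prev = &a;
		list_splice_head(&dst, &src);
		printf("%d %d\n", dst.next->val, src.next == &src); /* 7 1 */
		return (0);
	}
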
7251 7240
7252 7241 /*
7253 7242	 * This function is called by the page_capture_thread, and is needed
7254 7243	 * in order to initiate aio cleanup, so that pages used in aio
7255 7244 * will be unlocked and subsequently retired by page_capture_thread.
7256 7245 */
7257 7246 static int
7258 7247 do_aio_cleanup(void)
7259 7248 {
7260 7249 proc_t *procp;
7261 7250 int (*aio_cleanup_dr_delete_memory)(proc_t *);
7262 7251 int cleaned = 0;
7263 7252
7264 7253 if (modload("sys", "kaio") == -1) {
7265 7254 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7266 7255 return (0);
7267 7256 }
7268 7257 /*
7269 7258 * We use the aio_cleanup_dr_delete_memory function to
7270 7259 * initiate the actual clean up; this function will wake
7271 7260 * up the per-process aio_cleanup_thread.
7272 7261 */
7273 7262 aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7274 7263 modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7275 7264 if (aio_cleanup_dr_delete_memory == NULL) {
7276 7265 cmn_err(CE_WARN,
7277 7266 "aio_cleanup_dr_delete_memory not found in kaio");
7278 7267 return (0);
7279 7268 }
7280 7269 mutex_enter(&pidlock);
7281 7270 for (procp = practive; (procp != NULL); procp = procp->p_next) {
7282 7271 mutex_enter(&procp->p_lock);
7283 7272 if (procp->p_aio != NULL) {
7284 7273 /* cleanup proc's outstanding kaio */
7285 7274 cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7286 7275 }
7287 7276 mutex_exit(&procp->p_lock);
7288 7277 }
7289 7278 mutex_exit(&pidlock);
7290 7279 return (cleaned);
7291 7280 }
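
do_aio_cleanup() resolves aio_cleanup_dr_delete_memory at run time via modload()/modgetsymvalue() so that vm_page.c never links against kaio directly and fails soft when the module is unavailable. The closest userland analogue of this pattern is dlopen()/dlsym(); a hedged sketch follows (build with -ldl where required; libm and cos are stand-ins, and the library's soname varies by platform):

	#include <dlfcn.h>
	#include <stdio.h>

	int
	main(void)
	{
		void *hdl;
		double (*fn)(double);

		/* cf. modload("sys", "kaio"); soname varies by platform */
		hdl = dlopen("libm.so.6", RTLD_LAZY);
		if (hdl == NULL) {
			(void) fprintf(stderr, "cannot load libm\n");
			return (0);	/* fail soft, as do_aio_cleanup does */
		}
		/* cf. modgetsymvalue("aio_cleanup_dr_delete_memory", 0) */
		fn = (double (*)(double))dlsym(hdl, "cos");
		if (fn == NULL) {
			(void) fprintf(stderr, "cos not found\n");
			(void) dlclose(hdl);
			return (0);
		}
		(void) printf("cos(0) = %f\n", fn(0.0));
		(void) dlclose(hdl);
		return (0);
	}
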
7292 7281
7293 7282 /*
7294 7283 * helper function for page_capture_thread
7295 7284 */
7296 7285 static void
7297 7286 page_capture_handle_outstanding(void)
7298 7287 {
7299 7288 int ntry;
7300 7289
7301 7290	/* Reap pages before attempting to capture pages */
7302 7291 kmem_reap();
7303 7292
7304 7293 if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
7305 7294 hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
7306 7295 /*
7307 7296		 * Note: We purge only on platforms that support
7308 7297		 * ISM hat_pageunload() - mainly SPARC. On x86/x64
7309 7298		 * platforms ISM pages stay SE_SHARED locked until destroyed.
7310 7299 */
7311 7300
7312 7301 /* disable and purge seg_pcache */
7313 7302 (void) seg_p_disable();
7314 7303 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7315 7304 if (!page_retire_pend_count())
7316 7305 break;
7317 7306 if (do_aio_cleanup()) {
7318 7307 /*
7319 7308				 * allow the apps' cleanup threads
7320 7309 * to run
7321 7310 */
7322 7311 delay(pc_thread_shortwait);
7323 7312 }
7324 7313 page_capture_async();
7325 7314 }
7326 7315 /* reenable seg_pcache */
7327 7316 seg_p_enable();
7328 7317
7329 7318 /* completed what can be done. break out */
7330 7319 return;
7331 7320 }
7332 7321
7333 7322 /*
7334 7323 * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
7335 7324 * and then attempt to capture.
7336 7325 */
7337 7326 seg_preap();
7338 7327 page_capture_async();
7339 7328 }
7340 7329
7341 7330 /*
7342 7331 * The page_capture_thread loops forever, looking to see if there are
7343 7332 * pages still waiting to be captured.
7344 7333 */
7345 7334 static void
7346 7335 page_capture_thread(void)
7347 7336 {
7348 7337 callb_cpr_t c;
7349 7338 int i;
7350 7339 int high_pri_pages;
7351 7340 int low_pri_pages;
7352 7341 clock_t timeout;
7353 7342
7354 7343 CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7355 7344
7356 7345 mutex_enter(&pc_thread_mutex);
7357 7346 for (;;) {
7358 7347 high_pri_pages = 0;
7359 7348 low_pri_pages = 0;
7360 7349 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7361 7350 high_pri_pages +=
7362 7351 page_capture_hash[i].num_pages[PC_PRI_HI];
7363 7352 low_pri_pages +=
7364 7353 page_capture_hash[i].num_pages[PC_PRI_LO];
7365 7354 }
7366 7355
7367 7356 timeout = pc_thread_longwait;
7368 7357 if (high_pri_pages != 0) {
7369 7358 timeout = pc_thread_shortwait;
7370 7359 page_capture_handle_outstanding();
7371 7360 } else if (low_pri_pages != 0) {
7372 7361 page_capture_async();
7373 7362 }
7374 7363 CALLB_CPR_SAFE_BEGIN(&c);
7375 7364 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
7376 7365 timeout, TR_CLOCK_TICK);
7377 7366 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7378 7367 }
7379 7368 /*NOTREACHED*/
7380 7369 }
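
page_capture_thread() is a classic periodic worker: it polls the per-bucket tallies, does the appropriate work, then sleeps on a condition variable with a short timeout when high-priority pages are pending and a long one otherwise, so a signal on pc_cv can always wake it early. A userland sketch of the same shape using POSIX threads (all names here are illustrative):

	#include <pthread.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	static pthread_mutex_t work_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t work_cv = PTHREAD_COND_INITIALIZER;
	static int high_pri_pending;	/* stand-ins for num_pages[] */
	static int low_pri_pending;
	static int shutting_down;

	static void *
	worker(void *arg)
	{
		struct timespec ts;
		int timeout_sec;

		pthread_mutex_lock(&work_mutex);
		while (!shutting_down) {
			timeout_sec = 60;	/* cf. pc_thread_longwait */
			if (high_pri_pending != 0) {
				timeout_sec = 1; /* cf. pc_thread_shortwait */
				printf("handling high-priority work\n");
				high_pri_pending = 0;
			} else if (low_pri_pending != 0) {
				printf("handling low-priority work\n");
				low_pri_pending = 0;
			}
			clock_gettime(CLOCK_REALTIME, &ts);
			ts.tv_sec += timeout_sec;
			/* Sleeps at most timeout_sec; a signal wakes it. */
			(void) pthread_cond_timedwait(&work_cv, &work_mutex,
			    &ts);
		}
		pthread_mutex_unlock(&work_mutex);
		return (arg);
	}

	int
	main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, worker, NULL);
		pthread_mutex_lock(&work_mutex);
		high_pri_pending = 1;		/* queue work ... */
		pthread_cond_signal(&work_cv);	/* ... and wake the worker */
		pthread_mutex_unlock(&work_mutex);
		sleep(1);
		pthread_mutex_lock(&work_mutex);
		shutting_down = 1;
		pthread_cond_signal(&work_cv);
		pthread_mutex_unlock(&work_mutex);
		pthread_join(tid, NULL);
		return (0);
	}
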
7381 7370 /*
7382 7371 * Attempt to locate a bucket that has enough pages to satisfy the request.
7383 7372 * The initial check is done without the lock to avoid unneeded contention.
7384 7373 * The function returns 1 if enough pages were found, else 0 if it could not
7385 7374 * find enough pages in a bucket.
7386 7375 */
7387 7376 static int
7388 7377 pcf_decrement_bucket(pgcnt_t npages)
7389 7378 {
7390 7379 struct pcf *p;
7391 7380 struct pcf *q;
7392 7381 int i;
7393 7382
7394 7383 p = &pcf[PCF_INDEX()];
7395 7384 q = &pcf[pcf_fanout];
7396 7385 for (i = 0; i < pcf_fanout; i++) {
7397 7386 if (p->pcf_count > npages) {
7398 7387 /*
7399 7388 * a good one to try.
7400 7389 */
7401 7390 mutex_enter(&p->pcf_lock);
7402 7391 if (p->pcf_count > npages) {
7403 7392 p->pcf_count -= (uint_t)npages;
7404 7393 /*
7405 7394 * freemem is not protected by any lock.
7406 7395 * Thus, we cannot have any assertion
7407 7396 * containing freemem here.
7408 7397 */
7409 7398 freemem -= npages;
7410 7399 mutex_exit(&p->pcf_lock);
7411 7400 return (1);
7412 7401 }
7413 7402 mutex_exit(&p->pcf_lock);
7414 7403 }
7415 7404 p++;
7416 7405 if (p >= q) {
7417 7406 p = pcf;
7418 7407 }
7419 7408 }
7420 7409 return (0);
7421 7410 }
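
pcf_decrement_bucket() peeks at pcf_count without the lock purely as a cheap filter, then re-checks under pcf_lock before decrementing, so a stale unlocked read can never cause an over-subtraction. A self-contained POSIX-threads sketch of that sharded-counter pattern (all names here are illustrative):

	#include <pthread.h>
	#include <stdio.h>

	#define	FANOUT	4

	struct bucket {
		pthread_mutex_t lock;
		unsigned int count;
	};

	static struct bucket buckets[FANOUT];

	/*
	 * Take 'n' units from a single bucket if any one bucket holds
	 * enough.  The unlocked peek only filters out hopeless buckets;
	 * the count is re-checked under the lock before being
	 * decremented, just as pcf_decrement_bucket() re-checks
	 * pcf_count.
	 */
	static int
	take_from_one_bucket(unsigned int start, unsigned int n)
	{
		struct bucket *b;
		int i;

		for (i = 0; i < FANOUT; i++) {
			b = &buckets[(start + i) % FANOUT];
			if (b->count > n) {	/* optimistic, unlocked */
				pthread_mutex_lock(&b->lock);
				if (b->count > n) {	/* recheck, locked */
					b->count -= n;
					pthread_mutex_unlock(&b->lock);
					return (1);
				}
				pthread_mutex_unlock(&b->lock);
			}
		}
		return (0);
	}

	int
	main(void)
	{
		int i;

		for (i = 0; i < FANOUT; i++) {
			pthread_mutex_init(&buckets[i].lock, NULL);
			buckets[i].count = 10;
		}
		printf("%d\n", take_from_one_bucket(2, 5));	/* 1 */
		printf("%d\n", take_from_one_bucket(0, 50));	/* 0 */
		return (0);
	}

Note the strict '>' comparisons mirror the original's pcf_count > npages, so this path never drains a bucket to zero.
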
7422 7411
7423 7412 /*
7424 7413 * Arguments:
7425 7414 * pcftotal_ret: If the value is not NULL and we have walked all the
7426 7415 * buckets but did not find enough pages then it will
7427 7416 * be set to the total number of pages in all the pcf
7428 7417 * buckets.
7429 7418 * npages: Is the number of pages we have been requested to
7430 7419 * find.
7431 7420 * unlock: If set to 0 we will leave the buckets locked if the
7432 7421	 *			requested number of pages is not found.
7433 7422 *
7434 7423 * Go and try to satisfy the page request from any number of buckets.
7435 7424 * This can be a very expensive operation as we have to lock the buckets
7436 7425 * we are checking (and keep them locked), starting at bucket 0.
7437 7426 *
7438 7427 * The function returns 1 if enough pages were found, else 0 if it could not
7439 7428 * find enough pages in the buckets.
7440 7429 *
7441 7430 */
7442 7431 static int
7443 7432 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7444 7433 {
7445 7434 struct pcf *p;
7446 7435 pgcnt_t pcftotal;
7447 7436 int i;
7448 7437
7449 7438 p = pcf;
7450 7439 /* try to collect pages from several pcf bins */
7451 7440 for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7452 7441 mutex_enter(&p->pcf_lock);
7453 7442 pcftotal += p->pcf_count;
7454 7443 if (pcftotal >= npages) {
7455 7444 /*
7456 7445			 * Wow! There are enough pages lying around
7457 7446 * to satisfy the request. Do the accounting,
7458 7447 * drop the locks we acquired, and go back.
7459 7448 *
7460 7449 * freemem is not protected by any lock. So,
7461 7450 * we cannot have any assertion containing
7462 7451 * freemem.
7463 7452 */
7464 7453 freemem -= npages;
7465 7454 while (p >= pcf) {
7466 7455 if (p->pcf_count <= npages) {
7467 7456 npages -= p->pcf_count;
7468 7457 p->pcf_count = 0;
7469 7458 } else {
7470 7459 p->pcf_count -= (uint_t)npages;
7471 7460 npages = 0;
7472 7461 }
7473 7462 mutex_exit(&p->pcf_lock);
7474 7463 p--;
7475 7464 }
7476 7465 ASSERT(npages == 0);
7477 7466 return (1);
7478 7467 }
7479 7468 p++;
7480 7469 }
7481 7470 if (unlock) {
7482 7471 /* failed to collect pages - release the locks */
7483 7472 while (--p >= pcf) {
7484 7473 mutex_exit(&p->pcf_lock);
7485 7474 }
7486 7475 }
7487 7476 if (pcftotal_ret != NULL)
7488 7477 *pcftotal_ret = pcftotal;
7489 7478 return (0);
7490 7479 }
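
pcf_decrement_multiple() differs from the single-bucket path in that it accumulates across buckets while keeping every visited bucket locked, so the total it has seen cannot shrink before the buckets are drained back-to-front; taking the locks in index order from bucket 0 also keeps concurrent callers from deadlocking against each other. A self-contained sketch of that shape (names are illustrative):

	#include <pthread.h>
	#include <stdio.h>

	#define	FANOUT	4

	struct bucket {
		pthread_mutex_t lock;
		unsigned int count;
	};

	static struct bucket buckets[FANOUT];

	/*
	 * Collect 'n' units across buckets, locking each bucket as it
	 * is visited and keeping every lock held until the request is
	 * satisfied, then draining the locked buckets back-to-front.
	 * Returns 1 on success, 0 (all locks released) on failure.
	 */
	static int
	take_from_many(unsigned int n)
	{
		struct bucket *b = buckets;
		unsigned int total = 0;
		int i;

		for (i = 0; i < FANOUT; i++, b++) {
			pthread_mutex_lock(&b->lock);
			total += b->count;
			if (total >= n) {
				while (b >= buckets) {
					if (b->count <= n) {
						n -= b->count;
						b->count = 0;
					} else {
						b->count -= n;
						n = 0;
					}
					pthread_mutex_unlock(&b->lock);
					b--;
				}
				return (1);
			}
		}
		while (--b >= buckets)		/* give back the locks */
			pthread_mutex_unlock(&b->lock);
		return (0);
	}

	int
	main(void)
	{
		int i;

		for (i = 0; i < FANOUT; i++) {
			pthread_mutex_init(&buckets[i].lock, NULL);
			buckets[i].count = 3;
		}
		printf("%d\n", take_from_many(10));	/* 1: 12 >= 10 */
		printf("%d\n", take_from_many(10));	/* 0: only 2 left */
		return (0);
	}
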
↓ open down ↓ |
4200 lines elided |
↑ open up ↑ |