6146 seg_inherit_notsup is redundant
--- old/usr/src/uts/common/vm/vm_seg.c
+++ new/usr/src/uts/common/vm/vm_seg.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright (c) 2015, Joyent, Inc.
25 25 * Copyright 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
26 26 */
27 27
28 28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 29 /* All Rights Reserved */
30 30
31 31 /*
32 32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 33 * The Regents of the University of California
34 34 * All Rights Reserved
35 35 *
36 36 * University Acknowledgment- Portions of this document are derived from
37 37 * software developed by the University of California, Berkeley, and its
38 38 * contributors.
39 39 */
40 40
41 41 /*
42 42 * VM - segment management.
43 43 */
44 44
45 45 #include <sys/types.h>
46 46 #include <sys/inttypes.h>
47 47 #include <sys/t_lock.h>
48 48 #include <sys/param.h>
49 49 #include <sys/systm.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/sysmacros.h>
52 52 #include <sys/vmsystm.h>
53 53 #include <sys/tuneable.h>
54 54 #include <sys/debug.h>
55 55 #include <sys/fs/swapnode.h>
56 56 #include <sys/cmn_err.h>
57 57 #include <sys/callb.h>
58 58 #include <sys/mem_config.h>
59 59 #include <sys/mman.h>
60 60
61 61 #include <vm/hat.h>
62 62 #include <vm/as.h>
63 63 #include <vm/seg.h>
64 64 #include <vm/seg_kmem.h>
65 65 #include <vm/seg_spt.h>
66 66 #include <vm/seg_vn.h>
67 67 #include <vm/anon.h>
68 68
69 69 /*
70 70 * kstats for segment advise
71 71 */
72 72 segadvstat_t segadvstat = {
73 73 { "MADV_FREE_hit", KSTAT_DATA_ULONG },
74 74 { "MADV_FREE_miss", KSTAT_DATA_ULONG },
75 75 };
76 76
77 77 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
78 78 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
79 79
80 80 /*
81 81 * entry in the segment page cache
82 82 */
83 83 struct seg_pcache {
84 84 struct seg_pcache *p_hnext; /* list for hashed blocks */
85 85 struct seg_pcache *p_hprev;
86 86 pcache_link_t p_plink; /* per segment/amp list */
87 87 void *p_htag0; /* segment/amp pointer */
88 88 caddr_t p_addr; /* base address/anon_idx */
89 89 size_t p_len; /* total bytes */
90 90 size_t p_wlen; /* writable bytes at p_addr */
91 91 struct page **p_pp; /* pp shadow list */
92 92 seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
93 93 clock_t p_lbolt; /* lbolt from last use */
94 94 struct seg_phash *p_hashp; /* our pcache hash bucket */
95 95 uint_t p_active; /* active count */
96 96 uchar_t p_write; /* true if S_WRITE */
97 97 uchar_t p_ref; /* reference byte */
98 98 ushort_t p_flags; /* bit flags */
99 99 };
100 100
101 101 struct seg_phash {
102 102 struct seg_pcache *p_hnext; /* list for hashed blocks */
103 103 struct seg_pcache *p_hprev;
104 104 kmutex_t p_hmutex; /* protects hash bucket */
105 105 pcache_link_t p_halink[2]; /* active bucket linkages */
106 106 };
107 107
108 108 struct seg_phash_wired {
109 109 struct seg_pcache *p_hnext; /* list for hashed blocks */
110 110 struct seg_pcache *p_hprev;
111 111 kmutex_t p_hmutex; /* protects hash bucket */
112 112 };
113 113
114 114 /*
115 115 * A parameter to control a maximum number of bytes that can be
116 116 * purged from pcache at a time.
117 117 */
118 118 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
119 119
120 120 /*
121 121 * log2(fraction of pcache to reclaim at a time).
122 122 */
123 123 #define P_SHRINK_SHFT (5)
124 124
125 125 /*
126 126 * The following variables can be tuned via /etc/system.
127 127 */
128 128
129 129 int segpcache_enabled = 1; /* if 1, shadow lists are cached */
130 130 pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */
131 131 ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */
132 132 ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
133 133 int segpcache_reap_sec = 1; /* reap check rate in secs */
134 134 clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */
135 135 int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */
136 136 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
137 137 int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
138 138 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
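
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): the
 * tunables above are ordinary kernel variables read by seg_pinit()
 * before the cache is set up, so they can be overridden from
 * /etc/system, for example:
 *
 *        set segpcache_enabled=0
 *        set segpcache_pcp_maxage_sec=5
 *        set segpcache_maxapurge_bytes=0x20000000
 *
 * The values shown are hypothetical; they only demonstrate the syntax.
 */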
139 139
140 140 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
141 141 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
142 142 static kcondvar_t seg_pasync_cv;
143 143
144 144 #pragma align 64(pctrl1)
145 145 #pragma align 64(pctrl2)
146 146 #pragma align 64(pctrl3)
147 147
148 148 /*
149 149 * Keep frequently used variables together in one cache line.
150 150 */
151 151 static struct p_ctrl1 {
152 152 uint_t p_disabled; /* if not 0, caching temporarily off */
153 153 pgcnt_t p_maxwin; /* max # of pages that can be cached */
154 154 size_t p_hashwin_sz; /* # of non wired buckets */
155 155 struct seg_phash *p_htabwin; /* hash table for non wired entries */
156 156 size_t p_hashwired_sz; /* # of wired buckets */
157 157 struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
158 158 kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */
159 159 #ifdef _LP64
160 160 ulong_t pad[1];
161 161 #endif /* _LP64 */
162 162 } pctrl1;
163 163
164 164 static struct p_ctrl2 {
165 165 kmutex_t p_mem_mtx; /* protects window counter and p_halinks */
166 166 pgcnt_t p_locked_win; /* # pages from window */
167 167 pgcnt_t p_locked; /* # of pages cached by pagelock */
168 168 uchar_t p_ahcur; /* current active links for insert/delete */
169 169 uchar_t p_athr_on; /* async reclaim thread is running. */
170 170 pcache_link_t p_ahhead[2]; /* active buckets linkages */
171 171 } pctrl2;
172 172
173 173 static struct p_ctrl3 {
174 174 clock_t p_pcp_maxage; /* max pcp age in ticks */
175 175 ulong_t p_athr_empty_ahb; /* athread walk stats */
176 176 ulong_t p_athr_full_ahb; /* athread walk stats */
177 177 pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */
178 178 int p_shrink_shft; /* reap shift factor */
179 179 #ifdef _LP64
180 180 ulong_t pad[3];
181 181 #endif /* _LP64 */
182 182 } pctrl3;
183 183
184 184 #define seg_pdisabled pctrl1.p_disabled
185 185 #define seg_pmaxwindow pctrl1.p_maxwin
186 186 #define seg_phashsize_win pctrl1.p_hashwin_sz
187 187 #define seg_phashtab_win pctrl1.p_htabwin
188 188 #define seg_phashsize_wired pctrl1.p_hashwired_sz
189 189 #define seg_phashtab_wired pctrl1.p_htabwired
190 190 #define seg_pkmcache pctrl1.p_kmcache
191 191 #define seg_pmem_mtx pctrl2.p_mem_mtx
192 192 #define seg_plocked_window pctrl2.p_locked_win
193 193 #define seg_plocked pctrl2.p_locked
194 194 #define seg_pahcur pctrl2.p_ahcur
195 195 #define seg_pathr_on pctrl2.p_athr_on
196 196 #define seg_pahhead pctrl2.p_ahhead
197 197 #define seg_pmax_pcpage pctrl3.p_pcp_maxage
198 198 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
199 199 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
200 200 #define seg_pshrink_shift pctrl3.p_shrink_shft
201 201 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
202 202
203 203 #define P_HASHWIN_MASK (seg_phashsize_win - 1)
204 204 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
205 205 #define P_BASESHIFT (6)
206 206
207 207 kthread_t *seg_pasync_thr;
208 208
209 209 extern struct seg_ops segvn_ops;
210 210 extern struct seg_ops segspt_shmops;
211 211
212 212 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
213 213 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
214 214
215 215 #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t)))
216 216
217 217 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
218 218
219 219 /*
220 220 * htag0 argument can be a seg or amp pointer.
221 221 */
222 222 #define P_HASHBP(seg, htag0, addr, flags) \
223 223 (IS_PFLAGS_WIRED((flags)) ? \
224 224 ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
225 225 ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
226 226 (&seg_phashtab_win[P_HASHWIN_MASK & \
227 227 (((uintptr_t)(htag0) >> 3) ^ \
228 228 ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \
229 229 (flags >> 16) : page_get_shift((seg)->s_szc))))]))
230 230
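/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): for wired
 * requests P_HASHBP() indexes the wired table from the htag0 pointer
 * alone; for non wired requests it also folds in the address, scaled
 * either by the segment's page size or by an explicit shift carried in
 * the upper 16 bits of flags when SEGP_PSHIFT is set.  A hypothetical
 * caller asking for 4M (shift 22) hash granularity would do:
 *
 *        uint_t flags = SEGP_PSHIFT | (22 << 16);
 *        struct seg_phash *hp = P_HASHBP(seg, (void *)seg, addr, flags);
 */
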
231 231 /*
232 232 * htag0 argument can be a seg or amp pointer.
233 233 */
234 234 #define P_MATCH(pcp, htag0, addr, len) \
235 235 ((pcp)->p_htag0 == (htag0) && \
236 236 (pcp)->p_addr == (addr) && \
237 237 (pcp)->p_len >= (len))
238 238
239 239 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \
240 240 ((pcp)->p_pp == (pp) && \
241 241 (pcp)->p_htag0 == (htag0) && \
242 242 (pcp)->p_addr == (addr) && \
243 243 (pcp)->p_len >= (len))
244 244
245 245 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
246 246 offsetof(struct seg_pcache, p_plink)))
247 247
248 248 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
249 249 offsetof(struct seg_phash, p_halink[l])))
250 250
251 251 /*
252 252 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
253 253 * active hash bucket lists. We maintain active bucket lists to reduce the
254 254 * overhead of finding active buckets during asynchronous purging since there
255 255 * can be 10s of millions of buckets on a large system but only a small subset
256 256 * of them in actual use.
257 257 *
258 258 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
259 259 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
260 260 * buckets. The other list is used by asynchronous purge thread. This allows
261 261 * the purge thread to walk its active list without holding seg_pmem_mtx for a
262 262 * long time. When asynchronous thread is done with its list it switches to
263 263 * current active list and makes the list it just finished processing as
264 264 * current active list.
265 265 *
266 266 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
267 267 * yet on any list. seg_premove_abuck() may remove the bucket from either
268 268 * list. If the bucket is on current list it will be always removed. Otherwise
269 269 * the bucket is only removed if asynchronous purge thread is not currently
270 270 * running or seg_premove_abuck() is called by asynchronous purge thread
271 271 * itself. A given bucket can only be on one of active lists at a time. These
272 272 * routines should be called with per bucket lock held. The routines use
273 273 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
274 274 * the first entry is added to the bucket chain and seg_premove_abuck() must
275 275 * be called after the last pcp entry is deleted from its chain. Per bucket
276 276 * lock should be held by the callers. This avoids a potential race condition
277 277 * when seg_premove_abuck() removes a bucket after pcp entries are added to
278 278 * its list after the caller checked that the bucket has no entries. (this
279 279 * race would cause a loss of an active bucket from the active lists).
280 280 *
281 281 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
282 282 * New entries are added to the end of the list since LRU is used as the
283 283 * purging policy.
284 284 */
285 285 static void
286 286 seg_padd_abuck(struct seg_phash *hp)
287 287 {
288 288 int lix;
289 289
290 290 ASSERT(MUTEX_HELD(&hp->p_hmutex));
291 291 ASSERT((struct seg_phash *)hp->p_hnext != hp);
292 292 ASSERT((struct seg_phash *)hp->p_hprev != hp);
293 293 ASSERT(hp->p_hnext == hp->p_hprev);
294 294 ASSERT(!IS_PCP_WIRED(hp->p_hnext));
295 295 ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
296 296 ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
297 297 ASSERT(hp >= seg_phashtab_win &&
298 298 hp < &seg_phashtab_win[seg_phashsize_win]);
299 299
300 300 /*
301 301 * This bucket can already be on one of active lists
302 302 * since seg_premove_abuck() may have failed to remove it
303 303 * before.
304 304 */
305 305 mutex_enter(&seg_pmem_mtx);
306 306 lix = seg_pahcur;
307 307 ASSERT(lix >= 0 && lix <= 1);
308 308 if (hp->p_halink[lix].p_lnext != NULL) {
309 309 ASSERT(hp->p_halink[lix].p_lprev != NULL);
310 310 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
311 311 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
312 312 mutex_exit(&seg_pmem_mtx);
313 313 return;
314 314 }
315 315 ASSERT(hp->p_halink[lix].p_lprev == NULL);
316 316
317 317 /*
318 318 * If this bucket is still on list !lix async thread can't yet remove
319 319 * it since we hold here per bucket lock. In this case just return
320 320 * since async thread will eventually find and process this bucket.
321 321 */
322 322 if (hp->p_halink[!lix].p_lnext != NULL) {
323 323 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
324 324 mutex_exit(&seg_pmem_mtx);
325 325 return;
326 326 }
327 327 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
328 328 /*
329 329 * This bucket is not on any active bucket list yet.
330 330 * Add the bucket to the tail of current active list.
331 331 */
332 332 hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
333 333 hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
334 334 seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
335 335 seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
336 336 mutex_exit(&seg_pmem_mtx);
337 337 }
338 338
339 339 static void
340 340 seg_premove_abuck(struct seg_phash *hp, int athr)
341 341 {
342 342 int lix;
343 343
344 344 ASSERT(MUTEX_HELD(&hp->p_hmutex));
345 345 ASSERT((struct seg_phash *)hp->p_hnext == hp);
346 346 ASSERT((struct seg_phash *)hp->p_hprev == hp);
347 347 ASSERT(hp >= seg_phashtab_win &&
348 348 hp < &seg_phashtab_win[seg_phashsize_win]);
349 349
350 350 if (athr) {
351 351 ASSERT(seg_pathr_on);
352 352 ASSERT(seg_pahcur <= 1);
353 353 /*
354 354 * We are called by asynchronous thread that found this bucket
355 355 * on not currently active (i.e. !seg_pahcur) list. Remove it
356 356 * from there. Per bucket lock we are holding makes sure
357 357 * seg_pinsert() can't sneak in and add pcp entries to this
358 358 * bucket right before we remove the bucket from its list.
359 359 */
360 360 lix = !seg_pahcur;
361 361 ASSERT(hp->p_halink[lix].p_lnext != NULL);
362 362 ASSERT(hp->p_halink[lix].p_lprev != NULL);
363 363 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
364 364 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
365 365 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
366 366 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
367 367 hp->p_halink[lix].p_lnext = NULL;
368 368 hp->p_halink[lix].p_lprev = NULL;
369 369 return;
370 370 }
371 371
372 372 mutex_enter(&seg_pmem_mtx);
373 373 lix = seg_pahcur;
374 374 ASSERT(lix >= 0 && lix <= 1);
375 375
376 376 /*
377 377 * If the bucket is on currently active list just remove it from
378 378 * there.
379 379 */
380 380 if (hp->p_halink[lix].p_lnext != NULL) {
381 381 ASSERT(hp->p_halink[lix].p_lprev != NULL);
382 382 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
383 383 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
384 384 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
385 385 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
386 386 hp->p_halink[lix].p_lnext = NULL;
387 387 hp->p_halink[lix].p_lprev = NULL;
388 388 mutex_exit(&seg_pmem_mtx);
389 389 return;
390 390 }
391 391 ASSERT(hp->p_halink[lix].p_lprev == NULL);
392 392
393 393 /*
394 394 * If asynchronous thread is not running we can remove the bucket from
395 395 * not currently active list. The bucket must be on this list since we
396 396 * already checked that it's not on the other list and the bucket from
397 397 * which we just deleted the last pcp entry must be still on one of the
398 398 * active bucket lists.
399 399 */
400 400 lix = !lix;
401 401 ASSERT(hp->p_halink[lix].p_lnext != NULL);
402 402 ASSERT(hp->p_halink[lix].p_lprev != NULL);
403 403
404 404 if (!seg_pathr_on) {
405 405 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
406 406 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
407 407 hp->p_halink[lix].p_lnext = NULL;
408 408 hp->p_halink[lix].p_lprev = NULL;
409 409 }
410 410 mutex_exit(&seg_pmem_mtx);
411 411 }
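
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): the
 * insert and delete paths further down follow the protocol described
 * above: a bucket is linked onto an active list only when its chain
 * goes from empty to non-empty (hp->p_hprev == pcp right after the
 * insert means pcp is the only entry) and unlinked only once the chain
 * is empty again, always under the per bucket lock.
 *
 *        On insert (see seg_pinsert()):
 *                mutex_enter(&hp->p_hmutex);
 *                ... link pcp onto hp's chain ...
 *                if (hp->p_hprev == pcp)
 *                        seg_padd_abuck(hp);
 *                mutex_exit(&hp->p_hmutex);
 *
 *        On delete (see seg_pinactive()), after unlinking pcp:
 *                if (hp->p_hnext == (struct seg_pcache *)hp)
 *                        seg_premove_abuck(hp, 0);
 *                mutex_exit(&hp->p_hmutex);
 */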
412 412
413 413 /*
414 414 * Check if bucket pointed by hp already has a pcp entry that matches request
415 415 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
416 416 * Also delete matching entries that cover smaller address range but start
417 417 * at the same address as addr argument. Return the list of deleted entries if
418 418 * any. This is an internal helper function called from seg_pinsert() only
419 419 * for non wired shadow lists. The caller already holds a per seg/amp list
420 420 * lock.
421 421 */
422 422 static struct seg_pcache *
423 423 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
424 424 caddr_t addr, size_t len, int *found)
425 425 {
426 426 struct seg_pcache *pcp;
427 427 struct seg_pcache *delcallb_list = NULL;
428 428
429 429 ASSERT(MUTEX_HELD(&hp->p_hmutex));
430 430
431 431 *found = 0;
432 432 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
433 433 pcp = pcp->p_hnext) {
434 434 ASSERT(pcp->p_hashp == hp);
435 435 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
436 436 ASSERT(!IS_PCP_WIRED(pcp));
437 437 if (pcp->p_len < len) {
438 438 pcache_link_t *plinkp;
439 439 if (pcp->p_active) {
440 440 continue;
441 441 }
442 442 plinkp = &pcp->p_plink;
443 443 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
444 444 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
445 445 pcp->p_hprev->p_hnext = pcp->p_hnext;
446 446 pcp->p_hnext->p_hprev = pcp->p_hprev;
447 447 pcp->p_hprev = delcallb_list;
448 448 delcallb_list = pcp;
449 449 } else {
450 450 *found = 1;
451 451 break;
452 452 }
453 453 }
454 454 }
455 455 return (delcallb_list);
456 456 }
457 457
458 458 /*
459 459 * lookup an address range in pagelock cache. Return shadow list and bump up
460 460 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
461 461 * as a lookup tag.
462 462 */
463 463 struct page **
464 464 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
465 465 enum seg_rw rw, uint_t flags)
466 466 {
467 467 struct seg_pcache *pcp;
468 468 struct seg_phash *hp;
469 469 void *htag0;
470 470
471 471 ASSERT(seg != NULL);
472 472 ASSERT(rw == S_READ || rw == S_WRITE);
473 473
474 474 /*
475 475 * Skip pagelock cache, while DR is in progress or
476 476 * seg_pcache is off.
477 477 */
478 478 if (seg_pdisabled) {
479 479 return (NULL);
480 480 }
481 481 ASSERT(seg_phashsize_win != 0);
482 482
483 483 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
484 484 hp = P_HASHBP(seg, htag0, addr, flags);
485 485 mutex_enter(&hp->p_hmutex);
486 486 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
487 487 pcp = pcp->p_hnext) {
488 488 ASSERT(pcp->p_hashp == hp);
489 489 if (P_MATCH(pcp, htag0, addr, len)) {
490 490 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
491 491 /*
492 492 * If this request wants to write pages
493 493 * but write permissions starting from
494 494 * addr don't cover the entire length len
495 495 * return lookup failure back to the caller.
496 496 * It will check protections and fail this
497 497 * pagelock operation with EACCES error.
498 498 */
499 499 if (rw == S_WRITE && pcp->p_wlen < len) {
500 500 break;
501 501 }
502 502 if (pcp->p_active == UINT_MAX) {
503 503 break;
504 504 }
505 505 pcp->p_active++;
506 506 if (rw == S_WRITE && !pcp->p_write) {
507 507 pcp->p_write = 1;
508 508 }
509 509 mutex_exit(&hp->p_hmutex);
510 510 return (pcp->p_pp);
511 511 }
512 512 }
513 513 mutex_exit(&hp->p_hmutex);
514 514 return (NULL);
515 515 }
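
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): a segment
 * driver's pagelock path might use the cache roughly like this,
 * assuming a hypothetical reclaim callback my_reclaim():
 *
 *        On L_PAGELOCK, try the cache before building a shadow list:
 *                pplist = seg_plookup(seg, amp, addr, len, rw, 0);
 *                if (pplist != NULL) {
 *                        *ppp = pplist;
 *                        return (0);
 *                }
 *                ... lock the pages, build pplist, call seg_pinsert() ...
 *
 *        On L_PAGEUNLOCK, drop the active hold; pcache or the callback
 *        performs the real page unlocking:
 *                seg_pinactive(seg, amp, addr, len, *ppp, rw, 0,
 *                    my_reclaim);
 */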
516 516
517 517 /*
518 518 * mark address range inactive. If the cache is off or the address range is
519 519 * not in the cache or another shadow list that covers bigger range is found
520 520 * we call the segment driver to reclaim the pages. Otherwise just decrement
521 521 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
522 522 * otherwise use seg as a lookup tag.
523 523 */
524 524 void
525 525 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
526 526 size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
527 527 seg_preclaim_cbfunc_t callback)
528 528 {
529 529 struct seg_pcache *pcp;
530 530 struct seg_phash *hp;
531 531 kmutex_t *pmtx = NULL;
532 532 pcache_link_t *pheadp;
533 533 void *htag0;
534 534 pgcnt_t npages = 0;
535 535 int keep = 0;
536 536
537 537 ASSERT(seg != NULL);
538 538 ASSERT(rw == S_READ || rw == S_WRITE);
539 539
540 540 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
541 541
542 542 /*
543 543 * Skip lookup if pcache is not configured.
544 544 */
545 545 if (seg_phashsize_win == 0) {
546 546 goto out;
547 547 }
548 548
549 549 /*
550 550 * Grab per seg/amp lock before hash lock if we are going to remove
551 551 * inactive entry from pcache.
552 552 */
553 553 if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
554 554 if (amp == NULL) {
555 555 pheadp = &seg->s_phead;
556 556 pmtx = &seg->s_pmtx;
557 557 } else {
558 558 pheadp = &amp->a_phead;
559 559 pmtx = &amp->a_pmtx;
560 560 }
561 561 mutex_enter(pmtx);
562 562 }
563 563
564 564 hp = P_HASHBP(seg, htag0, addr, flags);
565 565 mutex_enter(&hp->p_hmutex);
566 566 again:
567 567 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
568 568 pcp = pcp->p_hnext) {
569 569 ASSERT(pcp->p_hashp == hp);
570 570 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
571 571 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
572 572 ASSERT(pcp->p_active);
573 573 if (keep) {
574 574 /*
575 575 * Don't remove this pcp entry
576 576 * if we didn't find duplicate
577 577 * shadow lists on second search.
578 578 * Somebody removed those duplicates
579 579 * since we dropped hash lock after first
580 580 * search.
581 581 */
582 582 ASSERT(pmtx != NULL);
583 583 ASSERT(!IS_PFLAGS_WIRED(flags));
584 584 mutex_exit(pmtx);
585 585 pmtx = NULL;
586 586 }
587 587 pcp->p_active--;
588 588 if (pcp->p_active == 0 && (pmtx != NULL ||
589 589 (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
590 590
591 591 /*
592 592 * This entry is no longer active. Remove it
593 593 * now either because pcaching is temporarily
594 594 * disabled or there're other pcp entries that
595 595 * can match this pagelock request (i.e. this
596 596 * entry is a duplicate).
597 597 */
598 598
599 599 ASSERT(callback == pcp->p_callback);
600 600 if (pmtx != NULL) {
601 601 pcache_link_t *plinkp = &pcp->p_plink;
602 602 ASSERT(!IS_PCP_WIRED(pcp));
603 603 ASSERT(pheadp->p_lnext != pheadp);
604 604 ASSERT(pheadp->p_lprev != pheadp);
605 605 plinkp->p_lprev->p_lnext =
606 606 plinkp->p_lnext;
607 607 plinkp->p_lnext->p_lprev =
608 608 plinkp->p_lprev;
609 609 }
610 610 pcp->p_hprev->p_hnext = pcp->p_hnext;
611 611 pcp->p_hnext->p_hprev = pcp->p_hprev;
612 612 if (!IS_PCP_WIRED(pcp) &&
613 613 hp->p_hnext == (struct seg_pcache *)hp) {
614 614 /*
615 615 * We removed the last entry from this
616 616 * bucket. Now remove the bucket from
617 617 * its active list.
618 618 */
619 619 seg_premove_abuck(hp, 0);
620 620 }
621 621 mutex_exit(&hp->p_hmutex);
622 622 if (pmtx != NULL) {
623 623 mutex_exit(pmtx);
624 624 }
625 625 len = pcp->p_len;
626 626 npages = btop(len);
627 627 if (rw != S_WRITE && pcp->p_write) {
628 628 rw = S_WRITE;
629 629 }
630 630 kmem_cache_free(seg_pkmcache, pcp);
631 631 goto out;
632 632 } else {
633 633 /*
634 634 * We found a matching pcp entry but will not
635 635 * free it right away even if it's no longer
636 636 * active.
637 637 */
638 638 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
639 639 /*
640 640 * Set the reference bit and mark the
641 641 * time of last access to this pcp
642 642 * so that asynchronous thread doesn't
643 643 * free it immediately since
644 644 * it may be reactivated very soon.
645 645 */
646 646 pcp->p_lbolt = ddi_get_lbolt();
647 647 pcp->p_ref = 1;
648 648 }
649 649 mutex_exit(&hp->p_hmutex);
650 650 if (pmtx != NULL) {
651 651 mutex_exit(pmtx);
652 652 }
653 653 return;
654 654 }
655 655 } else if (!IS_PFLAGS_WIRED(flags) &&
656 656 P_MATCH(pcp, htag0, addr, len)) {
657 657 /*
658 658 * This is a duplicate pcp entry. This situation may
659 659 * happen if a bigger shadow list that covers our
660 660 * range was added while our entry was still active.
661 661 * Now we can free our pcp entry if it becomes
662 662 * inactive.
663 663 */
664 664 if (!pcp->p_active) {
665 665 /*
666 666 * Mark this entry as referenced just in case
667 667 * we'll free our own pcp entry soon.
668 668 */
669 669 pcp->p_lbolt = ddi_get_lbolt();
670 670 pcp->p_ref = 1;
671 671 }
672 672 if (pmtx != NULL) {
673 673 /*
674 674 * we are already holding pmtx and found a
675 675 * duplicate. Don't keep our own pcp entry.
676 676 */
677 677 keep = 0;
678 678 continue;
679 679 }
680 680 /*
681 681 * We have to use mutex_tryenter to attempt to lock
682 682 * seg/amp list lock since we already hold hash lock
683 683 * and seg/amp list lock is above hash lock in lock
684 684 * order. If mutex_tryenter fails drop hash lock and
685 685 * retake both locks in correct order and re-search
686 686 * this hash chain.
687 687 */
688 688 ASSERT(keep == 0);
689 689 if (amp == NULL) {
690 690 pheadp = &seg->s_phead;
691 691 pmtx = &seg->s_pmtx;
692 692 } else {
693 693 pheadp = &amp->a_phead;
694 694 pmtx = &amp->a_pmtx;
695 695 }
696 696 if (!mutex_tryenter(pmtx)) {
697 697 mutex_exit(&hp->p_hmutex);
698 698 mutex_enter(pmtx);
699 699 mutex_enter(&hp->p_hmutex);
700 700 /*
701 701 * If we don't find bigger shadow list on
702 702 * second search (it may happen since we
703 703 * dropped bucket lock) keep the entry that
704 704 * matches our own shadow list.
705 705 */
706 706 keep = 1;
707 707 goto again;
708 708 }
709 709 }
710 710 }
711 711 mutex_exit(&hp->p_hmutex);
712 712 if (pmtx != NULL) {
713 713 mutex_exit(pmtx);
714 714 }
715 715 out:
716 716 (*callback)(htag0, addr, len, pp, rw, 0);
717 717 if (npages) {
718 718 mutex_enter(&seg_pmem_mtx);
719 719 ASSERT(seg_plocked >= npages);
720 720 seg_plocked -= npages;
721 721 if (!IS_PFLAGS_WIRED(flags)) {
722 722 ASSERT(seg_plocked_window >= npages);
723 723 seg_plocked_window -= npages;
724 724 }
725 725 mutex_exit(&seg_pmem_mtx);
726 726 }
727 727
728 728 }
729 729
730 730 #ifdef DEBUG
731 731 static uint32_t p_insert_chk_mtbf = 0;
732 732 #endif
733 733
734 734 /*
735 735 * The seg_pinsert_check() is used by segment drivers to predict whether
736 736 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
737 737 */
738 738 /*ARGSUSED*/
739 739 int
740 740 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
741 741 size_t len, uint_t flags)
742 742 {
743 743 ASSERT(seg != NULL);
744 744
745 745 #ifdef DEBUG
746 746 if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
747 747 return (SEGP_FAIL);
748 748 }
749 749 #endif
750 750
751 751 if (seg_pdisabled) {
752 752 return (SEGP_FAIL);
753 753 }
754 754 ASSERT(seg_phashsize_win != 0);
755 755
756 756 if (IS_PFLAGS_WIRED(flags)) {
757 757 return (SEGP_SUCCESS);
758 758 }
759 759
760 760 if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
761 761 return (SEGP_FAIL);
762 762 }
763 763
764 764 if (freemem < desfree) {
765 765 return (SEGP_FAIL);
766 766 }
767 767
768 768 return (SEGP_SUCCESS);
769 769 }
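
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): a driver
 * is expected to call seg_pinsert_check() before the expensive work of
 * locking pages and allocating a shadow list, for example:
 *
 *        if (seg_pinsert_check(seg, amp, addr, len, flags) !=
 *            SEGP_SUCCESS) {
 *                ... fall back to the uncached pagelock path ...
 *        }
 *        ... lock pages, allocate and fill the shadow list ...
 *        (void) seg_pinsert(seg, amp, addr, len, wlen, pplist, rw,
 *            flags, my_reclaim);
 *
 * my_reclaim stands in for the driver's reclaim callback; the check is
 * only a prediction, so seg_pinsert() itself may still return SEGP_FAIL.
 */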
770 770
771 771 #ifdef DEBUG
772 772 static uint32_t p_insert_mtbf = 0;
773 773 #endif
774 774
775 775 /*
776 776 * Insert address range with shadow list into pagelock cache if there's no
777 777 * shadow list already cached for this address range. If the cache is off or
778 778 * caching is temporarily disabled or the allowed 'window' is exceeded return
779 779 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
780 780 *
781 781 * For non wired shadow lists (segvn case) include address in the hashing
782 782 * function to avoid linking all the entries from the same segment or amp on
783 783 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
784 784 * pcache entries are also linked on a per segment/amp list so that all
785 785 * entries can be found quickly during seg/amp purge without walking the
786 786 * entire pcache hash table. For wired shadow lists (segspt case) we
787 787 * don't use address hashing and per segment linking because the caller
788 788 * currently inserts only one entry per segment that covers the entire
789 789 * segment. If we used per segment linking even for segspt it would complicate
790 790 * seg_ppurge_wiredpp() locking.
791 791 *
792 792 * Both hash bucket and per seg/amp locks need to be held before adding a non
793 793 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
794 794 * first.
795 795 *
796 796 * This function will also remove from pcache old inactive shadow lists that
797 797 * overlap with this request but cover smaller range for the same start
798 798 * address.
799 799 */
800 800 int
801 801 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
802 802 size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
803 803 seg_preclaim_cbfunc_t callback)
804 804 {
805 805 struct seg_pcache *pcp;
806 806 struct seg_phash *hp;
807 807 pgcnt_t npages;
808 808 pcache_link_t *pheadp;
809 809 kmutex_t *pmtx;
810 810 struct seg_pcache *delcallb_list = NULL;
811 811
812 812 ASSERT(seg != NULL);
813 813 ASSERT(rw == S_READ || rw == S_WRITE);
814 814 ASSERT(rw == S_READ || wlen == len);
815 815 ASSERT(rw == S_WRITE || wlen <= len);
816 816 ASSERT(amp == NULL || wlen == len);
817 817
818 818 #ifdef DEBUG
819 819 if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
820 820 return (SEGP_FAIL);
821 821 }
822 822 #endif
823 823
824 824 if (seg_pdisabled) {
825 825 return (SEGP_FAIL);
826 826 }
827 827 ASSERT(seg_phashsize_win != 0);
828 828
829 829 ASSERT((len & PAGEOFFSET) == 0);
830 830 npages = btop(len);
831 831 mutex_enter(&seg_pmem_mtx);
832 832 if (!IS_PFLAGS_WIRED(flags)) {
833 833 if (seg_plocked_window + npages > seg_pmaxwindow) {
834 834 mutex_exit(&seg_pmem_mtx);
835 835 return (SEGP_FAIL);
836 836 }
837 837 seg_plocked_window += npages;
838 838 }
839 839 seg_plocked += npages;
840 840 mutex_exit(&seg_pmem_mtx);
841 841
842 842 pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
843 843 /*
844 844 * If amp is not NULL set htag0 to amp otherwise set it to seg.
845 845 */
846 846 if (amp == NULL) {
847 847 pcp->p_htag0 = (void *)seg;
848 848 pcp->p_flags = flags & 0xffff;
849 849 } else {
850 850 pcp->p_htag0 = (void *)amp;
851 851 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
852 852 }
853 853 pcp->p_addr = addr;
854 854 pcp->p_len = len;
855 855 pcp->p_wlen = wlen;
856 856 pcp->p_pp = pp;
857 857 pcp->p_write = (rw == S_WRITE);
858 858 pcp->p_callback = callback;
859 859 pcp->p_active = 1;
860 860
861 861 hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
862 862 if (!IS_PFLAGS_WIRED(flags)) {
863 863 int found;
864 864 void *htag0;
865 865 if (amp == NULL) {
866 866 pheadp = &seg->s_phead;
867 867 pmtx = &seg->s_pmtx;
868 868 htag0 = (void *)seg;
869 869 } else {
870 870 pheadp = &amp->a_phead;
871 871 pmtx = &amp->a_pmtx;
872 872 htag0 = (void *)amp;
873 873 }
874 874 mutex_enter(pmtx);
875 875 mutex_enter(&hp->p_hmutex);
876 876 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
877 877 len, &found);
878 878 if (found) {
879 879 mutex_exit(&hp->p_hmutex);
880 880 mutex_exit(pmtx);
881 881 mutex_enter(&seg_pmem_mtx);
882 882 seg_plocked -= npages;
883 883 seg_plocked_window -= npages;
884 884 mutex_exit(&seg_pmem_mtx);
885 885 kmem_cache_free(seg_pkmcache, pcp);
886 886 goto out;
887 887 }
888 888 pcp->p_plink.p_lnext = pheadp->p_lnext;
889 889 pcp->p_plink.p_lprev = pheadp;
890 890 pheadp->p_lnext->p_lprev = &pcp->p_plink;
891 891 pheadp->p_lnext = &pcp->p_plink;
892 892 } else {
893 893 mutex_enter(&hp->p_hmutex);
894 894 }
895 895 pcp->p_hashp = hp;
896 896 pcp->p_hnext = hp->p_hnext;
897 897 pcp->p_hprev = (struct seg_pcache *)hp;
898 898 hp->p_hnext->p_hprev = pcp;
899 899 hp->p_hnext = pcp;
900 900 if (!IS_PFLAGS_WIRED(flags) &&
901 901 hp->p_hprev == pcp) {
902 902 seg_padd_abuck(hp);
903 903 }
904 904 mutex_exit(&hp->p_hmutex);
905 905 if (!IS_PFLAGS_WIRED(flags)) {
906 906 mutex_exit(pmtx);
907 907 }
908 908
909 909 out:
910 910 npages = 0;
911 911 while (delcallb_list != NULL) {
912 912 pcp = delcallb_list;
913 913 delcallb_list = pcp->p_hprev;
914 914 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
915 915 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
916 916 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
917 917 npages += btop(pcp->p_len);
918 918 kmem_cache_free(seg_pkmcache, pcp);
919 919 }
920 920 if (npages) {
921 921 ASSERT(!IS_PFLAGS_WIRED(flags));
922 922 mutex_enter(&seg_pmem_mtx);
923 923 ASSERT(seg_plocked >= npages);
924 924 ASSERT(seg_plocked_window >= npages);
925 925 seg_plocked -= npages;
926 926 seg_plocked_window -= npages;
927 927 mutex_exit(&seg_pmem_mtx);
928 928 }
929 929
930 930 return (SEGP_SUCCESS);
931 931 }
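
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): the wlen
 * argument lets an S_READ request cache a shadow list whose tail is not
 * writable.  Caching 8 pages of which only the first 5 are writable
 * (amp must be NULL in that case, per the asserts above) would look
 * like:
 *
 *        (void) seg_pinsert(seg, NULL, addr, 8 * PAGESIZE,
 *            5 * PAGESIZE, pplist, S_READ, 0, my_reclaim);
 *
 * A later S_READ lookup of up to 8 pages at addr hits this entry, while
 * an S_WRITE lookup longer than 5 pages fails the p_wlen check in
 * seg_plookup() and falls back to the driver.  my_reclaim is a
 * hypothetical callback.
 */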
932 932
933 933 /*
934 934 * purge entries from the pagelock cache if not active
935 935 * and not recently used.
936 936 */
937 937 static void
938 938 seg_ppurge_async(int force)
939 939 {
940 940 struct seg_pcache *delcallb_list = NULL;
941 941 struct seg_pcache *pcp;
942 942 struct seg_phash *hp;
943 943 pgcnt_t npages = 0;
944 944 pgcnt_t npages_window = 0;
945 945 pgcnt_t npgs_to_purge;
946 946 pgcnt_t npgs_purged = 0;
947 947 int hlinks = 0;
948 948 int hlix;
949 949 pcache_link_t *hlinkp;
950 950 pcache_link_t *hlnextp = NULL;
951 951 int lowmem;
952 952 int trim;
953 953
954 954 ASSERT(seg_phashsize_win != 0);
955 955
956 956 /*
957 957 * if the cache is off or empty, return
958 958 */
959 959 if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
960 960 return;
961 961 }
962 962
963 963 if (!force) {
964 964 lowmem = 0;
965 965 trim = 0;
966 966 if (freemem < lotsfree + needfree) {
967 967 spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
968 968 if (fmem <= 5 * (desfree >> 2)) {
969 969 lowmem = 1;
970 970 } else if (fmem <= 7 * (lotsfree >> 3)) {
971 971 if (seg_plocked_window >=
972 972 (availrmem_initial >> 1)) {
973 973 lowmem = 1;
974 974 }
975 975 } else if (fmem < lotsfree) {
976 976 if (seg_plocked_window >=
977 977 3 * (availrmem_initial >> 2)) {
978 978 lowmem = 1;
979 979 }
980 980 }
981 981 }
982 982 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
983 983 trim = 1;
984 984 }
985 985 if (!lowmem && !trim) {
986 986 return;
987 987 }
988 988 npgs_to_purge = seg_plocked_window >>
989 989 seg_pshrink_shift;
990 990 if (lowmem) {
991 991 npgs_to_purge = MIN(npgs_to_purge,
992 992 MAX(seg_pmaxapurge_npages, desfree));
993 993 } else {
994 994 npgs_to_purge = MIN(npgs_to_purge,
995 995 seg_pmaxapurge_npages);
996 996 }
997 997 if (npgs_to_purge == 0) {
998 998 return;
999 999 }
1000 1000 } else {
1001 1001 struct seg_phash_wired *hpw;
1002 1002
1003 1003 ASSERT(seg_phashsize_wired != 0);
1004 1004
1005 1005 for (hpw = seg_phashtab_wired;
1006 1006 hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1007 1007
1008 1008 if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1009 1009 continue;
1010 1010 }
1011 1011
1012 1012 mutex_enter(&hpw->p_hmutex);
1013 1013
1014 1014 for (pcp = hpw->p_hnext;
1015 1015 pcp != (struct seg_pcache *)hpw;
1016 1016 pcp = pcp->p_hnext) {
1017 1017
1018 1018 ASSERT(IS_PCP_WIRED(pcp));
1019 1019 ASSERT(pcp->p_hashp ==
1020 1020 (struct seg_phash *)hpw);
1021 1021
1022 1022 if (pcp->p_active) {
1023 1023 continue;
1024 1024 }
1025 1025 pcp->p_hprev->p_hnext = pcp->p_hnext;
1026 1026 pcp->p_hnext->p_hprev = pcp->p_hprev;
1027 1027 pcp->p_hprev = delcallb_list;
1028 1028 delcallb_list = pcp;
1029 1029 }
1030 1030 mutex_exit(&hpw->p_hmutex);
1031 1031 }
1032 1032 }
1033 1033
1034 1034 mutex_enter(&seg_pmem_mtx);
1035 1035 if (seg_pathr_on) {
1036 1036 mutex_exit(&seg_pmem_mtx);
1037 1037 goto runcb;
1038 1038 }
1039 1039 seg_pathr_on = 1;
1040 1040 mutex_exit(&seg_pmem_mtx);
1041 1041 ASSERT(seg_pahcur <= 1);
1042 1042 hlix = !seg_pahcur;
1043 1043
1044 1044 again:
1045 1045 for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1046 1046 hlinkp = hlnextp) {
1047 1047
1048 1048 hlnextp = hlinkp->p_lnext;
1049 1049 ASSERT(hlnextp != NULL);
1050 1050
1051 1051 hp = hlink2phash(hlinkp, hlix);
1052 1052 if (hp->p_hnext == (struct seg_pcache *)hp) {
1053 1053 seg_pathr_empty_ahb++;
1054 1054 continue;
1055 1055 }
1056 1056 seg_pathr_full_ahb++;
1057 1057 mutex_enter(&hp->p_hmutex);
1058 1058
1059 1059 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1060 1060 pcp = pcp->p_hnext) {
1061 1061 pcache_link_t *pheadp;
1062 1062 pcache_link_t *plinkp;
1063 1063 void *htag0;
1064 1064 kmutex_t *pmtx;
1065 1065
1066 1066 ASSERT(!IS_PCP_WIRED(pcp));
1067 1067 ASSERT(pcp->p_hashp == hp);
1068 1068
1069 1069 if (pcp->p_active) {
1070 1070 continue;
1071 1071 }
1072 1072 if (!force && pcp->p_ref &&
1073 1073 PCP_AGE(pcp) < seg_pmax_pcpage) {
1074 1074 pcp->p_ref = 0;
1075 1075 continue;
1076 1076 }
1077 1077 plinkp = &pcp->p_plink;
1078 1078 htag0 = pcp->p_htag0;
1079 1079 if (pcp->p_flags & SEGP_AMP) {
1080 1080 pheadp = &((amp_t *)htag0)->a_phead;
1081 1081 pmtx = &((amp_t *)htag0)->a_pmtx;
1082 1082 } else {
1083 1083 pheadp = &((seg_t *)htag0)->s_phead;
1084 1084 pmtx = &((seg_t *)htag0)->s_pmtx;
1085 1085 }
1086 1086 if (!mutex_tryenter(pmtx)) {
1087 1087 continue;
1088 1088 }
1089 1089 ASSERT(pheadp->p_lnext != pheadp);
1090 1090 ASSERT(pheadp->p_lprev != pheadp);
1091 1091 plinkp->p_lprev->p_lnext =
1092 1092 plinkp->p_lnext;
1093 1093 plinkp->p_lnext->p_lprev =
1094 1094 plinkp->p_lprev;
1095 1095 pcp->p_hprev->p_hnext = pcp->p_hnext;
1096 1096 pcp->p_hnext->p_hprev = pcp->p_hprev;
1097 1097 mutex_exit(pmtx);
1098 1098 pcp->p_hprev = delcallb_list;
1099 1099 delcallb_list = pcp;
1100 1100 npgs_purged += btop(pcp->p_len);
1101 1101 }
1102 1102 if (hp->p_hnext == (struct seg_pcache *)hp) {
1103 1103 seg_premove_abuck(hp, 1);
1104 1104 }
1105 1105 mutex_exit(&hp->p_hmutex);
1106 1106 if (npgs_purged >= seg_plocked_window) {
1107 1107 break;
1108 1108 }
1109 1109 if (!force) {
1110 1110 if (npgs_purged >= npgs_to_purge) {
1111 1111 break;
1112 1112 }
1113 1113 if (!trim && !(seg_pathr_full_ahb & 15)) {
1114 1114 ASSERT(lowmem);
1115 1115 if (freemem >= lotsfree + needfree) {
1116 1116 break;
1117 1117 }
1118 1118 }
1119 1119 }
1120 1120 }
1121 1121
1122 1122 if (hlinkp == &seg_pahhead[hlix]) {
1123 1123 /*
1124 1124 * We processed the entire hlix active bucket list
1125 1125 * but didn't find enough pages to reclaim.
1126 1126 * Switch the lists and walk the other list
1127 1127 * if we haven't done it yet.
1128 1128 */
1129 1129 mutex_enter(&seg_pmem_mtx);
1130 1130 ASSERT(seg_pathr_on);
1131 1131 ASSERT(seg_pahcur == !hlix);
1132 1132 seg_pahcur = hlix;
1133 1133 mutex_exit(&seg_pmem_mtx);
1134 1134 if (++hlinks < 2) {
1135 1135 hlix = !hlix;
1136 1136 goto again;
1137 1137 }
1138 1138 } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1139 1139 seg_pahhead[hlix].p_lnext != hlinkp) {
1140 1140 ASSERT(hlinkp != NULL);
1141 1141 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1142 1142 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1143 1143 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1144 1144
1145 1145 /*
1146 1146 * Reinsert the header to point to hlinkp
1147 1147 * so that we start from hlinkp bucket next time around.
1148 1148 */
1149 1149 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1150 1150 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1151 1151 seg_pahhead[hlix].p_lnext = hlinkp;
1152 1152 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1153 1153 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1154 1154 hlinkp->p_lprev = &seg_pahhead[hlix];
1155 1155 }
1156 1156
1157 1157 mutex_enter(&seg_pmem_mtx);
1158 1158 ASSERT(seg_pathr_on);
1159 1159 seg_pathr_on = 0;
1160 1160 mutex_exit(&seg_pmem_mtx);
1161 1161
1162 1162 runcb:
1163 1163 /*
1164 1164 * Run the delayed callback list. segments/amps can't go away until
1165 1165 * callback is executed since they must have non 0 softlockcnt. That's
1166 1166 * why we don't need to hold as/seg/amp locks to execute the callback.
1167 1167 */
1168 1168 while (delcallb_list != NULL) {
1169 1169 pcp = delcallb_list;
1170 1170 delcallb_list = pcp->p_hprev;
1171 1171 ASSERT(!pcp->p_active);
1172 1172 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1173 1173 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1174 1174 npages += btop(pcp->p_len);
1175 1175 if (!IS_PCP_WIRED(pcp)) {
1176 1176 npages_window += btop(pcp->p_len);
1177 1177 }
1178 1178 kmem_cache_free(seg_pkmcache, pcp);
1179 1179 }
1180 1180 if (npages) {
1181 1181 mutex_enter(&seg_pmem_mtx);
1182 1182 ASSERT(seg_plocked >= npages);
1183 1183 ASSERT(seg_plocked_window >= npages_window);
1184 1184 seg_plocked -= npages;
1185 1185 seg_plocked_window -= npages_window;
1186 1186 mutex_exit(&seg_pmem_mtx);
1187 1187 }
1188 1188 }
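
/*
 * Editorial note (not part of vm_seg.c) on the non-forced thresholds
 * above, writing fmem = MAX(freemem - needfree, 0): when
 * freemem < lotsfree + needfree, memory is considered low if
 *
 *        fmem <= 5/4 * desfree, or
 *        fmem <= 7/8 * lotsfree and the window holds at least 1/2 of
 *            availrmem_initial, or
 *        fmem <  lotsfree and the window holds at least 3/4 of
 *            availrmem_initial.
 *
 * Independently of memory pressure, the window is trimmed once it
 * exceeds 7/8 of seg_pmaxwindow.  Each pass tries to purge
 * 1/2^seg_pshrink_shift (by default 1/32) of the window, capped at
 * seg_pmaxapurge_npages, or at MAX(seg_pmaxapurge_npages, desfree) in
 * the low memory case.
 */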
1189 1189
1190 1190 /*
1191 1191 * Remove cached pages for segment(s) entries from hashtable. The segments
1192 1192 * are identified by pp array. This is useful for multiple seg's cached on
1193 1193 * behalf of dummy segment (ISM/DISM) with common pp array.
1194 1194 */
1195 1195 void
1196 1196 seg_ppurge_wiredpp(struct page **pp)
1197 1197 {
1198 1198 struct seg_pcache *pcp;
1199 1199 struct seg_phash_wired *hp;
1200 1200 pgcnt_t npages = 0;
1201 1201 struct seg_pcache *delcallb_list = NULL;
1202 1202
1203 1203 /*
1204 1204 * if the cache is empty, return
1205 1205 */
1206 1206 if (seg_plocked == 0) {
1207 1207 return;
1208 1208 }
1209 1209 ASSERT(seg_phashsize_wired != 0);
1210 1210
1211 1211 for (hp = seg_phashtab_wired;
1212 1212 hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1213 1213 if (hp->p_hnext == (struct seg_pcache *)hp) {
1214 1214 continue;
1215 1215 }
1216 1216 mutex_enter(&hp->p_hmutex);
1217 1217 pcp = hp->p_hnext;
1218 1218 while (pcp != (struct seg_pcache *)hp) {
1219 1219 ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1220 1220 ASSERT(IS_PCP_WIRED(pcp));
1221 1221 /*
1222 1222 * purge entries which are not active
1223 1223 */
1224 1224 if (!pcp->p_active && pcp->p_pp == pp) {
1225 1225 ASSERT(pcp->p_htag0 != NULL);
1226 1226 pcp->p_hprev->p_hnext = pcp->p_hnext;
1227 1227 pcp->p_hnext->p_hprev = pcp->p_hprev;
1228 1228 pcp->p_hprev = delcallb_list;
1229 1229 delcallb_list = pcp;
1230 1230 }
1231 1231 pcp = pcp->p_hnext;
1232 1232 }
1233 1233 mutex_exit(&hp->p_hmutex);
1234 1234 /*
1235 1235 * segments can't go away until callback is executed since
1236 1236 * they must have non 0 softlockcnt. That's why we don't
1237 1237 * need to hold as/seg locks to execute the callback.
1238 1238 */
1239 1239 while (delcallb_list != NULL) {
1240 1240 int done;
1241 1241 pcp = delcallb_list;
1242 1242 delcallb_list = pcp->p_hprev;
1243 1243 ASSERT(!pcp->p_active);
1244 1244 done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1245 1245 pcp->p_len, pcp->p_pp,
1246 1246 pcp->p_write ? S_WRITE : S_READ, 1);
1247 1247 npages += btop(pcp->p_len);
1248 1248 ASSERT(IS_PCP_WIRED(pcp));
1249 1249 kmem_cache_free(seg_pkmcache, pcp);
1250 1250 if (done) {
1251 1251 ASSERT(delcallb_list == NULL);
1252 1252 goto out;
1253 1253 }
1254 1254 }
1255 1255 }
1256 1256
1257 1257 out:
1258 1258 mutex_enter(&seg_pmem_mtx);
1259 1259 ASSERT(seg_plocked >= npages);
1260 1260 seg_plocked -= npages;
1261 1261 mutex_exit(&seg_pmem_mtx);
1262 1262 }
1263 1263
1264 1264 /*
1265 1265 * purge all entries for a given segment. Since we
1266 1266 * callback into the segment driver directly for page
1267 1267 * reclaim the caller needs to hold the right locks.
1268 1268 */
1269 1269 void
1270 1270 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1271 1271 {
1272 1272 struct seg_pcache *delcallb_list = NULL;
1273 1273 struct seg_pcache *pcp;
1274 1274 struct seg_phash *hp;
1275 1275 pgcnt_t npages = 0;
1276 1276 void *htag0;
1277 1277
1278 1278 if (seg_plocked == 0) {
1279 1279 return;
1280 1280 }
1281 1281 ASSERT(seg_phashsize_win != 0);
1282 1282
1283 1283 /*
1284 1284 * If amp is not NULL use amp as a lookup tag otherwise use seg
1285 1285 * as a lookup tag.
1286 1286 */
1287 1287 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1288 1288 ASSERT(htag0 != NULL);
1289 1289 if (IS_PFLAGS_WIRED(flags)) {
1290 1290 hp = P_HASHBP(seg, htag0, 0, flags);
1291 1291 mutex_enter(&hp->p_hmutex);
1292 1292 pcp = hp->p_hnext;
1293 1293 while (pcp != (struct seg_pcache *)hp) {
1294 1294 ASSERT(pcp->p_hashp == hp);
1295 1295 ASSERT(IS_PCP_WIRED(pcp));
1296 1296 if (pcp->p_htag0 == htag0) {
1297 1297 if (pcp->p_active) {
1298 1298 break;
1299 1299 }
1300 1300 pcp->p_hprev->p_hnext = pcp->p_hnext;
1301 1301 pcp->p_hnext->p_hprev = pcp->p_hprev;
1302 1302 pcp->p_hprev = delcallb_list;
1303 1303 delcallb_list = pcp;
1304 1304 }
1305 1305 pcp = pcp->p_hnext;
1306 1306 }
1307 1307 mutex_exit(&hp->p_hmutex);
1308 1308 } else {
1309 1309 pcache_link_t *plinkp;
1310 1310 pcache_link_t *pheadp;
1311 1311 kmutex_t *pmtx;
1312 1312
1313 1313 if (amp == NULL) {
1314 1314 ASSERT(seg != NULL);
1315 1315 pheadp = &seg->s_phead;
1316 1316 pmtx = &seg->s_pmtx;
1317 1317 } else {
1318 1318 pheadp = &amp->a_phead;
1319 1319 pmtx = &amp->a_pmtx;
1320 1320 }
1321 1321 mutex_enter(pmtx);
1322 1322 while ((plinkp = pheadp->p_lnext) != pheadp) {
1323 1323 pcp = plink2pcache(plinkp);
1324 1324 ASSERT(!IS_PCP_WIRED(pcp));
1325 1325 ASSERT(pcp->p_htag0 == htag0);
1326 1326 hp = pcp->p_hashp;
1327 1327 mutex_enter(&hp->p_hmutex);
1328 1328 if (pcp->p_active) {
1329 1329 mutex_exit(&hp->p_hmutex);
1330 1330 break;
1331 1331 }
1332 1332 ASSERT(plinkp->p_lprev == pheadp);
1333 1333 pheadp->p_lnext = plinkp->p_lnext;
1334 1334 plinkp->p_lnext->p_lprev = pheadp;
1335 1335 pcp->p_hprev->p_hnext = pcp->p_hnext;
1336 1336 pcp->p_hnext->p_hprev = pcp->p_hprev;
1337 1337 pcp->p_hprev = delcallb_list;
1338 1338 delcallb_list = pcp;
1339 1339 if (hp->p_hnext == (struct seg_pcache *)hp) {
1340 1340 seg_premove_abuck(hp, 0);
1341 1341 }
1342 1342 mutex_exit(&hp->p_hmutex);
1343 1343 }
1344 1344 mutex_exit(pmtx);
1345 1345 }
1346 1346 while (delcallb_list != NULL) {
1347 1347 pcp = delcallb_list;
1348 1348 delcallb_list = pcp->p_hprev;
1349 1349 ASSERT(!pcp->p_active);
1350 1350 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1351 1351 pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1352 1352 npages += btop(pcp->p_len);
1353 1353 kmem_cache_free(seg_pkmcache, pcp);
1354 1354 }
1355 1355 mutex_enter(&seg_pmem_mtx);
1356 1356 ASSERT(seg_plocked >= npages);
1357 1357 seg_plocked -= npages;
1358 1358 if (!IS_PFLAGS_WIRED(flags)) {
1359 1359 ASSERT(seg_plocked_window >= npages);
1360 1360 seg_plocked_window -= npages;
1361 1361 }
1362 1362 mutex_exit(&seg_pmem_mtx);
1363 1363 }
1364 1364
1365 1365 static void seg_pinit_mem_config(void);
1366 1366
1367 1367 /*
1368 1368 * setup the pagelock cache
1369 1369 */
1370 1370 static void
1371 1371 seg_pinit(void)
1372 1372 {
1373 1373 struct seg_phash *hp;
1374 1374 ulong_t i;
1375 1375 pgcnt_t physmegs;
1376 1376
1377 1377 seg_plocked = 0;
1378 1378 seg_plocked_window = 0;
1379 1379
1380 1380 if (segpcache_enabled == 0) {
1381 1381 seg_phashsize_win = 0;
1382 1382 seg_phashsize_wired = 0;
1383 1383 seg_pdisabled = 1;
1384 1384 return;
1385 1385 }
1386 1386
1387 1387 seg_pdisabled = 0;
1388 1388 seg_pkmcache = kmem_cache_create("seg_pcache",
1389 1389 sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1390 1390 if (segpcache_pcp_maxage_ticks <= 0) {
1391 1391 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1392 1392 }
1393 1393 seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1394 1394 seg_pathr_empty_ahb = 0;
1395 1395 seg_pathr_full_ahb = 0;
1396 1396 seg_pshrink_shift = segpcache_shrink_shift;
1397 1397 seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1398 1398
1399 1399 mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 1400 mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1401 1401 mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1402 1402 cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1403 1403
1404 1404 physmegs = physmem >> (20 - PAGESHIFT);
1405 1405
1406 1406 /*
1407 1407 * If segpcache_hashsize_win was not set in /etc/system or it has
1408 1408 * absurd value set it to a default.
1409 1409 */
1410 1410 if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1411 1411 /*
1412 1412 * Create one bucket per 32K (or at least per 8 pages) of
1413 1413 * available memory.
1414 1414 */
1415 1415 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1416 1416 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1417 1417 }
1418 1418 if (!ISP2(segpcache_hashsize_win)) {
1419 1419 ulong_t rndfac = ~(1UL <<
1420 1420 (highbit(segpcache_hashsize_win) - 1));
1421 1421 rndfac &= segpcache_hashsize_win;
1422 1422 segpcache_hashsize_win += rndfac;
1423 1423 segpcache_hashsize_win = 1 <<
1424 1424 (highbit(segpcache_hashsize_win) - 1);
1425 1425 }
1426 1426 seg_phashsize_win = segpcache_hashsize_win;
1427 1427 seg_phashtab_win = kmem_zalloc(
1428 1428 seg_phashsize_win * sizeof (struct seg_phash),
1429 1429 KM_SLEEP);
1430 1430 for (i = 0; i < seg_phashsize_win; i++) {
1431 1431 hp = &seg_phashtab_win[i];
1432 1432 hp->p_hnext = (struct seg_pcache *)hp;
1433 1433 hp->p_hprev = (struct seg_pcache *)hp;
1434 1434 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1435 1435 }
1436 1436
1437 1437 seg_pahcur = 0;
1438 1438 seg_pathr_on = 0;
1439 1439 seg_pahhead[0].p_lnext = &seg_pahhead[0];
1440 1440 seg_pahhead[0].p_lprev = &seg_pahhead[0];
1441 1441 seg_pahhead[1].p_lnext = &seg_pahhead[1];
1442 1442 seg_pahhead[1].p_lprev = &seg_pahhead[1];
1443 1443
1444 1444 /*
1445 1445 * If segpcache_hashsize_wired was not set in /etc/system or it has
1446 1446 * absurd value set it to a default.
1447 1447 */
1448 1448 if (segpcache_hashsize_wired == 0 ||
1449 1449 segpcache_hashsize_wired > physmem / 4) {
1450 1450 /*
1451 1451 * Choose segpcache_hashsize_wired based on physmem.
1452 1452 * Create a bucket per 128K bytes, up to 256K buckets.
1453 1453 */
1454 1454 if (physmegs < 20 * 1024) {
1455 1455 segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1456 1456 } else {
1457 1457 segpcache_hashsize_wired = 256 * 1024;
1458 1458 }
1459 1459 }
1460 1460 if (!ISP2(segpcache_hashsize_wired)) {
1461 1461 segpcache_hashsize_wired = 1 <<
1462 1462 highbit(segpcache_hashsize_wired);
1463 1463 }
1464 1464 seg_phashsize_wired = segpcache_hashsize_wired;
1465 1465 seg_phashtab_wired = kmem_zalloc(
1466 1466 seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1467 1467 for (i = 0; i < seg_phashsize_wired; i++) {
1468 1468 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1469 1469 hp->p_hnext = (struct seg_pcache *)hp;
1470 1470 hp->p_hprev = (struct seg_pcache *)hp;
1471 1471 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1472 1472 }
1473 1473
1474 1474 if (segpcache_maxwindow == 0) {
1475 1475 if (physmegs < 64) {
1476 1476 /* 3% of memory */
1477 1477 segpcache_maxwindow = availrmem >> 5;
1478 1478 } else if (physmegs < 512) {
1479 1479 /* 12% of memory */
1480 1480 segpcache_maxwindow = availrmem >> 3;
1481 1481 } else if (physmegs < 1024) {
1482 1482 /* 25% of memory */
1483 1483 segpcache_maxwindow = availrmem >> 2;
1484 1484 } else if (physmegs < 2048) {
1485 1485 /* 50% of memory */
1486 1486 segpcache_maxwindow = availrmem >> 1;
1487 1487 } else {
1488 1488 /* no limit */
1489 1489 segpcache_maxwindow = (pgcnt_t)-1;
1490 1490 }
1491 1491 }
1492 1492 seg_pmaxwindow = segpcache_maxwindow;
1493 1493 seg_pinit_mem_config();
1494 1494 }
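
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): assuming
 * 4K pages and 1 GB of memory (physmem = 256K pages, physmegs = 1024),
 * the defaults above work out to roughly:
 *
 *        pages_per_bucket         = MAX(btop(32K), 8)        = 8
 *        segpcache_hashsize_win   = MAX(1024, 256K / 8)      = 32768
 *        segpcache_hashsize_wired = MAX(1024, physmegs << 3) = 8192
 *        segpcache_maxwindow      = availrmem >> 1   (physmegs < 2048)
 *
 * The page size and memory size are assumptions picked only to make
 * the arithmetic concrete.
 */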
1495 1495
1496 1496 /*
1497 1497 * called by pageout if memory is low
1498 1498 */
1499 1499 void
1500 1500 seg_preap(void)
1501 1501 {
1502 1502 /*
1503 1503 * if the cache is off or empty, return
1504 1504 */
1505 1505 if (seg_plocked_window == 0) {
1506 1506 return;
1507 1507 }
1508 1508 ASSERT(seg_phashsize_win != 0);
1509 1509
1510 1510 /*
1511 1511 * If somebody is already purging pcache
1512 1512 * just return.
1513 1513 */
1514 1514 if (seg_pdisabled) {
1515 1515 return;
1516 1516 }
1517 1517
1518 1518 cv_signal(&seg_pasync_cv);
1519 1519 }
1520 1520
1521 1521 /*
1522 1522 * run as a background thread and reclaim pagelock
1523 1523 * pages which have not been used recently
1524 1524 */
1525 1525 void
1526 1526 seg_pasync_thread(void)
1527 1527 {
1528 1528 callb_cpr_t cpr_info;
1529 1529
1530 1530 if (seg_phashsize_win == 0) {
1531 1531 thread_exit();
1532 1532 /*NOTREACHED*/
1533 1533 }
1534 1534
1535 1535 seg_pasync_thr = curthread;
1536 1536
1537 1537 CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1538 1538 callb_generic_cpr, "seg_pasync");
1539 1539
1540 1540 if (segpcache_reap_ticks <= 0) {
1541 1541 segpcache_reap_ticks = segpcache_reap_sec * hz;
1542 1542 }
1543 1543
1544 1544 mutex_enter(&seg_pasync_mtx);
1545 1545 for (;;) {
1546 1546 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1547 1547 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1548 1548 segpcache_reap_ticks, TR_CLOCK_TICK);
1549 1549 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1550 1550 if (seg_pdisabled == 0) {
1551 1551 seg_ppurge_async(0);
1552 1552 }
1553 1553 }
1554 1554 }
1555 1555
1556 1556 static struct kmem_cache *seg_cache;
1557 1557
1558 1558 /*
1559 1559 * Initialize segment management data structures.
1560 1560 */
1561 1561 void
1562 1562 seg_init(void)
1563 1563 {
1564 1564 kstat_t *ksp;
1565 1565
1566 1566 seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1567 1567 0, NULL, NULL, NULL, NULL, NULL, 0);
1568 1568
1569 1569 ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1570 1570 segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1571 1571 if (ksp) {
1572 1572 ksp->ks_data = (void *)segadvstat_ptr;
1573 1573 kstat_install(ksp);
1574 1574 }
1575 1575
1576 1576 seg_pinit();
1577 1577 }
1578 1578
1579 1579 /*
1580 1580 * Allocate a segment to cover [base, base+size]
1581 1581 * and attach it to the specified address space.
1582 1582 */
1583 1583 struct seg *
1584 1584 seg_alloc(struct as *as, caddr_t base, size_t size)
1585 1585 {
1586 1586 struct seg *new;
1587 1587 caddr_t segbase;
1588 1588 size_t segsize;
1589 1589
1590 1590 segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1591 1591 segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1592 1592 (uintptr_t)segbase;
1593 1593
1594 1594 if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1595 1595 return ((struct seg *)NULL); /* bad virtual addr range */
1596 1596
1597 1597 if (as != &kas &&
1598 1598 valid_usr_range(segbase, segsize, 0, as,
1599 1599 as->a_userlimit) != RANGE_OKAY)
1600 1600 return ((struct seg *)NULL); /* bad virtual addr range */
1601 1601
1602 1602 new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1603 1603 new->s_ops = NULL;
1604 1604 new->s_data = NULL;
1605 1605 new->s_szc = 0;
1606 1606 new->s_flags = 0;
1607 1607 mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1608 1608 new->s_phead.p_lnext = &new->s_phead;
1609 1609 new->s_phead.p_lprev = &new->s_phead;
1610 1610 if (seg_attach(as, segbase, segsize, new) < 0) {
1611 1611 kmem_cache_free(seg_cache, new);
1612 1612 return ((struct seg *)NULL);
1613 1613 }
1614 1614 /* caller must fill in ops, data */
1615 1615 return (new);
1616 1616 }
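
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): the usual
 * pattern, as in the as_map() path, is to allocate the segment and then
 * hand it to a segment driver's create routine, which fills in s_ops
 * and s_data:
 *
 *        seg = seg_alloc(as, addr, len);
 *        if (seg == NULL)
 *                return (ENOMEM);
 *        error = (*crfp)(seg, argsp);
 *        if (error != 0)
 *                seg_free(seg);
 *
 * The address space lock must be held as writer across this; crfp and
 * argsp stand in for the driver create function (e.g. segvn_create)
 * and its argument structure.
 */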
1617 1617
1618 1618 /*
1619 1619 * Attach a segment to the address space. Used by seg_alloc()
1620 1620 * and for kernel startup to attach to static segments.
1621 1621 */
1622 1622 int
1623 1623 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1624 1624 {
1625 1625 seg->s_as = as;
1626 1626 seg->s_base = base;
1627 1627 seg->s_size = size;
1628 1628
1629 1629 /*
1630 1630 * as_addseg() will add the segment at the appropriate point
1631 1631 * in the list. It will return -1 if there is overlap with
1632 1632 * an already existing segment.
1633 1633 */
1634 1634 return (as_addseg(as, seg));
1635 1635 }
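
A hedged sketch of the other use the comment mentions, attaching a statically allocated segment to kas during kernel startup. my_static_seg, my_static_segops and my_startup_attach() are illustrative names, and the AS_LOCK_ENTER form shown assumes the same a_lock conventions used by the assertions elsewhere in this file.

static struct seg my_static_seg;
extern struct seg_ops my_static_segops;

void
my_startup_attach(caddr_t base, size_t size)
{
	AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER);
	if (seg_attach(&kas, base, size, &my_static_seg) < 0)
		panic("my_startup_attach: seg_attach failed");
	my_static_seg.s_ops = &my_static_segops;	/* caller still fills in ops */
	AS_LOCK_EXIT(&kas, &kas.a_lock);
}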
1636 1636
1637 1637 /*
1638 1638 * Unmap a segment and free it from its associated address space.
1639 1639 * This should be called by anybody who's finished with a whole segment's
1640 1640 	 * mapping. Just calls segop_unmap() on the whole mapping. It is the
1641 1641 	 * responsibility of the segment driver to unlink the segment
1642 1642 * from the address space, and to free public and private data structures
1643 1643 * associated with the segment. (This is typically done by a call to
1644 1644 * seg_free()).
1645 1645 */
1646 1646 void
1647 1647 seg_unmap(struct seg *seg)
1648 1648 {
1649 1649 #ifdef DEBUG
1650 1650 int ret;
1651 1651 #endif /* DEBUG */
1652 1652
1653 1653 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1654 1654
1655 1655 /* Shouldn't have called seg_unmap if mapping isn't yet established */
1656 1656 ASSERT(seg->s_data != NULL);
1657 1657
1658 1658 /* Unmap the whole mapping */
1659 1659 #ifdef DEBUG
1660 1660 ret = segop_unmap(seg, seg->s_base, seg->s_size);
1661 1661 ASSERT(ret == 0);
1662 1662 #else
1663 1663 (void) segop_unmap(seg, seg->s_base, seg->s_size);
1664 1664 #endif /* DEBUG */
1665 1665 }
1666 1666
1667 1667 /*
1668 1668 * Free the segment from its associated as. This should only be called
1669 1669 * if a mapping to the segment has not yet been established (e.g., if
1670 1670 * an error occurs in the middle of doing an as_map when the segment
1671 1671 * has already been partially set up) or if it has already been deleted
1672 1672 * (e.g., from a segment driver unmap routine if the unmap applies to the
1673 1673 * entire segment). If the mapping is currently set up then seg_unmap() should
1674 1674 * be called instead.
1675 1675 */
1676 1676 void
1677 1677 seg_free(struct seg *seg)
1678 1678 {
1679 1679 register struct as *as = seg->s_as;
1680 1680 struct seg *tseg = as_removeseg(as, seg);
1681 1681
1682 1682 ASSERT(tseg == seg);
1683 1683
1684 1684 /*
1685 1685 * If the segment private data field is NULL,
1686 1686 * then segment driver is not attached yet.
1687 1687 */
1688 1688 if (seg->s_data != NULL)
1689 1689 segop_free(seg);
1690 1690
1691 1691 mutex_destroy(&seg->s_pmtx);
1692 1692 ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1693 1693 ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1694 1694 kmem_cache_free(seg_cache, seg);
1695 1695 }
1696 1696
1697 1697 /*ARGSUSED*/
1698 1698 static void
1699 1699 seg_p_mem_config_post_add(
1700 1700 void *arg,
1701 1701 pgcnt_t delta_pages)
1702 1702 {
1703 1703 /* Nothing to do. */
1704 1704 }
1705 1705
1706 1706 void
1707 1707 seg_p_enable(void)
1708 1708 {
1709 1709 mutex_enter(&seg_pcache_mtx);
1710 1710 ASSERT(seg_pdisabled != 0);
1711 1711 seg_pdisabled--;
1712 1712 mutex_exit(&seg_pcache_mtx);
1713 1713 }
1714 1714
1715 1715 /*
1716 1716 * seg_p_disable - disables seg_pcache, and then attempts to empty the
1717 1717 * cache.
1718 1718 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1719 1719 * SEGP_FAIL if the cache could not be emptied.
1720 1720 */
1721 1721 int
1722 1722 seg_p_disable(void)
1723 1723 {
1724 1724 pgcnt_t old_plocked;
1725 1725 int stall_count = 0;
1726 1726
1727 1727 mutex_enter(&seg_pcache_mtx);
1728 1728 seg_pdisabled++;
1729 1729 ASSERT(seg_pdisabled != 0);
1730 1730 mutex_exit(&seg_pcache_mtx);
1731 1731
1732 1732 /*
1733 1733 * Attempt to empty the cache. Terminate if seg_plocked does not
1734 1734 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1735 1735 */
1736 1736 while (seg_plocked != 0) {
1737 1737 ASSERT(seg_phashsize_win != 0);
1738 1738 old_plocked = seg_plocked;
1739 1739 seg_ppurge_async(1);
1740 1740 if (seg_plocked == old_plocked) {
1741 1741 if (stall_count++ > SEGP_STALL_THRESHOLD) {
1742 1742 return (SEGP_FAIL);
1743 1743 }
1744 1744 } else
1745 1745 stall_count = 0;
1746 1746 if (seg_plocked != 0)
1747 1747 delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1748 1748 }
1749 1749 return (SEGP_SUCCESS);
1750 1750 }
1751 1751
1752 1752 /*
1753 1753 * Attempt to purge seg_pcache. May need to return before this has
1754 1754 * completed to allow other pre_del callbacks to unlock pages. This is
1755 1755 * ok because:
1756 1756 * 1) The seg_pdisabled flag has been set so at least we won't
1757 1757 	 * cache any more locks and the locks we couldn't purge
1758 1758 * will not be held if they do get released by a subsequent
1759 1759 * pre-delete callback.
1760 1760 *
1761 1761 * 2) The rest of the memory delete thread processing does not
1762 1762 * depend on the changes made in this pre-delete callback. No
1763 1763 * panics will result, the worst that will happen is that the
1764 1764 * DR code will timeout and cancel the delete.
1765 1765 */
1766 1766 /*ARGSUSED*/
1767 1767 static int
1768 1768 seg_p_mem_config_pre_del(
1769 1769 void *arg,
1770 1770 pgcnt_t delta_pages)
1771 1771 {
1772 1772 if (seg_phashsize_win == 0) {
1773 1773 return (0);
1774 1774 }
1775 1775 if (seg_p_disable() != SEGP_SUCCESS)
1776 1776 cmn_err(CE_NOTE,
1777 1777 "!Pre-delete couldn't purge"" pagelock cache - continuing");
1778 1778 return (0);
1779 1779 }
1780 1780
1781 1781 /*ARGSUSED*/
1782 1782 static void
1783 1783 seg_p_mem_config_post_del(
1784 1784 void *arg,
1785 1785 pgcnt_t delta_pages,
1786 1786 int cancelled)
1787 1787 {
1788 1788 if (seg_phashsize_win == 0) {
1789 1789 return;
1790 1790 }
1791 1791 seg_p_enable();
1792 1792 }
1793 1793
1794 1794 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1795 1795 KPHYSM_SETUP_VECTOR_VERSION,
1796 1796 seg_p_mem_config_post_add,
1797 1797 seg_p_mem_config_pre_del,
1798 1798 seg_p_mem_config_post_del,
1799 1799 };
1800 1800
1801 1801 static void
1802 1802 seg_pinit_mem_config(void)
1803 1803 {
1804 1804 int ret;
1805 1805
1806 1806 ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1807 1807 /*
1808 1808 * Want to catch this in the debug kernel. At run time, if the
1809 1809 * callbacks don't get run all will be OK as the disable just makes
1810 1810 * it more likely that the pages can be collected.
1811 1811 */
1812 1812 ASSERT(ret == 0);
1813 1813 }
1814 1814
1815 1815 /*
1816 1816 * Verify that segment is not a shared anonymous segment which reserves
1817 1817 	 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
1818 1818 * from one zone to another if any segments are shared. This is because the
1819 1819 * last process to exit will credit the swap reservation. This could lead
1820 1820 * to the swap being reserved by one zone, and credited to another.
1821 1821 */
1822 1822 boolean_t
1823 1823 seg_can_change_zones(struct seg *seg)
1824 1824 {
1825 1825 struct segvn_data *svd;
1826 1826
1827 1827 if (seg->s_ops == &segspt_shmops)
1828 1828 return (B_FALSE);
1829 1829
1830 1830 if (seg->s_ops == &segvn_ops) {
1831 1831 svd = (struct segvn_data *)seg->s_data;
1832 1832 if (svd->type == MAP_SHARED &&
1833 1833 svd->amp != NULL &&
1834 1834 svd->amp->swresv > 0)
1835 1835 return (B_FALSE);
1836 1836 }
1837 1837 return (B_TRUE);
1838 1838 }
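
For context, a hedged sketch of how a caller might apply seg_can_change_zones() across a whole address space before rebinding it to another zone. This mirrors what the address-space layer does, but the function name and the AS_SEGFIRST/AS_SEGNEXT iteration shown here are assumptions for illustration, and a reader hold on a_lock is assumed.

static boolean_t
my_as_can_change_zones(struct as *as)
{
	struct seg *seg;
	boolean_t ret = B_TRUE;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		if (!seg_can_change_zones(seg)) {
			ret = B_FALSE;	/* shared, swap-reserving segment found */
			break;
		}
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (ret);
}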
1839 1839
1840 1840 /*
1841 1841 * Return swap reserved by a segment backing a private mapping.
1842 1842 */
1843 1843 size_t
1844 1844 seg_swresv(struct seg *seg)
1845 1845 {
1846 1846 struct segvn_data *svd;
1847 1847 size_t swap = 0;
1848 1848
1849 1849 if (seg->s_ops == &segvn_ops) {
1850 1850 svd = (struct segvn_data *)seg->s_data;
1851 1851 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1852 1852 swap = svd->swresv;
1853 1853 }
1854 1854 return (swap);
1855 1855 }
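
Similarly, a per-address-space swap total can be built by summing seg_swresv() over every segment. A minimal sketch, again assuming the AS_SEGFIRST/AS_SEGNEXT iterators and a reader hold on a_lock; my_as_swresv() is an illustrative name.

static size_t
my_as_swresv(struct as *as)
{
	struct seg *seg;
	size_t total = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
		total += seg_swresv(seg);	/* only private segvn mappings count */
	AS_LOCK_EXIT(as, &as->a_lock);
	return (total);
}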
1856 1856
1857 1857 /*
1858 - * General not supported function for segop_inherit
1859 - */
1860 -/* ARGSUSED */
1861 -int
1862 -seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1863 -{
1864 - return (ENOTSUP);
1865 -}
1866 -
1867 -/*
1868 1858 * segop wrappers
1869 1859 */
1870 1860 int
1871 1861 segop_dup(struct seg *seg, struct seg *new)
1872 1862 {
1873 1863 return (seg->s_ops->dup(seg, new));
1874 1864 }
1875 1865
1876 1866 int
1877 1867 segop_unmap(struct seg *seg, caddr_t addr, size_t len)
1878 1868 {
1879 1869 return (seg->s_ops->unmap(seg, addr, len));
1880 1870 }
1881 1871
1882 1872 void
1883 1873 segop_free(struct seg *seg)
1884 1874 {
1885 1875 seg->s_ops->free(seg);
1886 1876 }
1887 1877
1888 1878 faultcode_t
1889 1879 segop_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
1890 1880 enum fault_type type, enum seg_rw rw)
1891 1881 {
1892 1882 return (seg->s_ops->fault(hat, seg, addr, len, type, rw));
1893 1883 }
1894 1884
1895 1885 faultcode_t
1896 1886 segop_faulta(struct seg *seg, caddr_t addr)
1897 1887 {
1898 1888 return (seg->s_ops->faulta(seg, addr));
1899 1889 }
1900 1890
1901 1891 int
1902 1892 segop_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1903 1893 {
1904 1894 return (seg->s_ops->setprot(seg, addr, len, prot));
1905 1895 }
1906 1896
1907 1897 int
1908 1898 segop_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1909 1899 {
1910 1900 return (seg->s_ops->checkprot(seg, addr, len, prot));
1911 1901 }
1912 1902
1913 1903 int
1914 1904 segop_kluster(struct seg *seg, caddr_t addr, ssize_t d)
1915 1905 {
1916 1906 return (seg->s_ops->kluster(seg, addr, d));
1917 1907 }
1918 1908
1919 1909 size_t
1920 1910 segop_swapout(struct seg *seg)
1921 1911 {
1922 1912 return (seg->s_ops->swapout(seg));
1923 1913 }
1924 1914
1925 1915 int
1926 1916 segop_sync(struct seg *seg, caddr_t addr, size_t len, int atr, uint_t f)
1927 1917 {
1928 1918 return (seg->s_ops->sync(seg, addr, len, atr, f));
1929 1919 }
1930 1920
1931 1921 size_t
1932 1922 segop_incore(struct seg *seg, caddr_t addr, size_t len, char *v)
1933 1923 {
1934 1924 return (seg->s_ops->incore(seg, addr, len, v));
1935 1925 }
1936 1926
1937 1927 int
1938 1928 segop_lockop(struct seg *seg, caddr_t addr, size_t len, int atr, int op,
1939 1929 ulong_t *b, size_t p)
1940 1930 {
1941 1931 return (seg->s_ops->lockop(seg, addr, len, atr, op, b, p));
1942 1932 }
1943 1933
1944 1934 int
1945 1935 segop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *p)
1946 1936 {
1947 1937 return (seg->s_ops->getprot(seg, addr, len, p));
1948 1938 }
1949 1939
1950 1940 u_offset_t
1951 1941 segop_getoffset(struct seg *seg, caddr_t addr)
1952 1942 {
1953 1943 return (seg->s_ops->getoffset(seg, addr));
1954 1944 }
1955 1945
1956 1946 int
1957 1947 segop_gettype(struct seg *seg, caddr_t addr)
1958 1948 {
1959 1949 return (seg->s_ops->gettype(seg, addr));
1960 1950 }
1961 1951
1962 1952 int
1963 1953 segop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
1964 1954 {
1965 1955 return (seg->s_ops->getvp(seg, addr, vpp));
1966 1956 }
1967 1957
1968 1958 int
1969 1959 segop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t b)
1970 1960 {
1971 1961 return (seg->s_ops->advise(seg, addr, len, b));
1972 1962 }
1973 1963
1974 1964 void
1975 1965 segop_dump(struct seg *seg)
1976 1966 {
1977 1967 seg->s_ops->dump(seg);
1978 1968 }
1979 1969
1980 1970 int
1981 1971 segop_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***page,
1982 1972 enum lock_type type, enum seg_rw rw)
1983 1973 {
1984 1974 return (seg->s_ops->pagelock(seg, addr, len, page, type, rw));
1985 1975 }
1986 1976
1987 1977 int
1988 1978 segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
1989 1979 {
1990 1980 return (seg->s_ops->setpagesize(seg, addr, len, szc));
1991 1981 }
1992 1982
1993 1983 int
1994 1984 segop_getmemid(struct seg *seg, caddr_t addr, memid_t *mp)
1995 1985 {
1996 1986 return (seg->s_ops->getmemid(seg, addr, mp));
1997 1987 }
1998 1988
1999 1989 struct lgrp_mem_policy_info *
2000 1990 segop_getpolicy(struct seg *seg, caddr_t addr)
2001 1991 {
2002 1992 if (seg->s_ops->getpolicy == NULL)
2003 1993 return (NULL);
2004 1994
2005 1995 return (seg->s_ops->getpolicy(seg, addr));
2006 1996 }
2007 1997
2008 1998 int
2009 1999 segop_capable(struct seg *seg, segcapability_t cap)
2010 2000 {
2011 2001 return (seg->s_ops->capable(seg, cap));
2012 2002 }
2013 2003
2014 2004 int
2015 2005 segop_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t op)
2016 2006 {
2017 2007 if (seg->s_ops->inherit == NULL)
2018 2008 return (ENOTSUP);
2019 2009
2020 2010 return (seg->s_ops->inherit(seg, addr, len, op));
2021 2011 }
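
This NULL check is what makes the deleted seg_inherit_notsup() stub redundant: a driver that does not support inheritance can simply leave the inherit slot unset, and segop_inherit() returns ENOTSUP on its behalf. A minimal sketch, with my_segops and the my_* entry points as illustrative names and most mandatory ops omitted for brevity:

static int my_dup(struct seg *, struct seg *);
static int my_unmap(struct seg *, caddr_t, size_t);
static void my_free(struct seg *);

static struct seg_ops my_segops = {
	.dup	= my_dup,
	.unmap	= my_unmap,
	.free	= my_free,
	/* ...remaining ops elided for brevity... */
	/* .inherit left NULL: previously pointed at seg_inherit_notsup */
};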