6583 remove whole-process swapping

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  26 /*       All Rights Reserved   */
  27 
  28 /*
  29  * University Copyright- Copyright (c) 1982, 1986, 1988
  30  * The Regents of the University of California
  31  * All Rights Reserved
  32  *
  33  * University Acknowledgment- Portions of this document are derived from
  34  * software developed by the University of California, Berkeley, and its
  35  * contributors.
  36  */
  37 
  38 #ifndef _VM_ANON_H
  39 #define _VM_ANON_H
  40 
  41 #include <sys/cred.h>
  42 #include <sys/zone.h>
  43 #include <vm/seg.h>
  44 #include <vm/vpage.h>
  45 
  46 #ifdef  __cplusplus
  47 extern "C" {
  48 #endif
  49 
  50 /*
  51  * VM - Anonymous pages.
  52  */
  53 
  54 typedef unsigned long anoff_t;          /* anon offsets */
  55 
  56 /*
  57  *      Each anonymous page, either in memory or in swap, has an anon structure.
  58  * The structure (slot) provides a level of indirection between anonymous pages
  59  * and their backing store.
  60  *
  61  *      (an_vp, an_off) names the vnode of the anonymous page for this slot.
  62  *
  63  *      (an_pvp, an_poff) names the location of the physical backing store
  64  *      for the page this slot represents. If the name is null there is no
  65  *      associated physical store. The physical backing store location can
  66  *      change while the slot is in use.
  67  *
  68  *      an_hash is a hash list of anon slots. The list is hashed by
  69  *      (an_vp, an_off) of the associated anonymous page and provides a
  70  *      method of going from the name of an anonymous page to its
  71  *      associated anon slot.
  72  *
  73  *      an_refcnt holds a reference count which is the number of separate
  74  *      copies that will need to be created in case of copy-on-write.
  75  *      A refcnt > 0 protects the existence of the slot. The refcnt is
  76  *      initialized to 1 when the anon slot is created in anon_alloc().
  77  *      If a client obtains an anon slot and allows multiple threads to
  78  *      share it, then it is the client's responsibility to ensure that
  79  *      it does not allow one thread to try to reference the slot at the
  80  *      same time as another is trying to decrement the last count and
  81  *      destroy the anon slot. E.g., the seg_vn segment type protects
  82  *      against this with higher level locks.
  83  */
  84 
  85 struct anon {
  86         struct vnode *an_vp;    /* vnode naming the anon page */
  87         struct vnode *an_pvp;   /* backing store vnode; null if none */
  88         anoff_t an_off;         /* offset of anon page (with an_vp) */
  89         anoff_t an_poff;        /* offset of backing store in an_pvp */
  90         struct anon *an_hash;   /* link on (an_vp, an_off) hash list */
  91         int an_refcnt;          /* # of sharers; > 0 keeps slot alive */
  92 };
  93 
  94 #define AN_CACHE_ALIGN_LOG2     4       /* log2(AN_CACHE_ALIGN) */
  95 #define AN_CACHE_ALIGN  (1U << AN_CACHE_ALIGN_LOG2) /* anon address aligned */
  96                                                 /* 16 bytes */
  97 
  98 
  99 #ifdef _KERNEL
 100 /*
 101  * The swapinfo_lock protects:
 102  *              swapinfo list
 103  *              individual swapinfo structures
 104  *
 105  * The anoninfo_lock protects:
 106  *              anoninfo counters
 107  *
 108  * The anonhash_lock protects:
 109  *              anon hash lists
 110  *              anon slot fields
 111  *
 112  * Fields in the anon slot which are read-only for the life of the slot
 113  * (an_vp, an_off) do not require the anonhash_lock be held to access them.
 114  * If you access a field without the anonhash_lock held you must be holding
 115  * the slot with an_refcnt to make sure it isn't destroyed.
 116  * To write (an_pvp, an_poff) in a given slot you must also hold the
 117  * p_iolock of the anonymous page for slot.
 118  */
 119 extern kmutex_t anoninfo_lock;
 120 extern kmutex_t swapinfo_lock;
 121 extern pad_mutex_t *anonhash_lock;
 122 extern pad_mutex_t anon_array_lock[];
 123 extern kcondvar_t anon_array_cv[];
 124 
 125 /*
 126  * Global hash table to provide a function from (vp, off) -> ap
 127  */
 128 extern size_t anon_hash_size;
 129 extern unsigned int anon_hash_shift;
 130 extern struct anon **anon_hash;
 131 #define ANON_HASH_SIZE  anon_hash_size
 132 #define ANON_HASHAVELEN 4
 133 /*
 134  * Hash (vp, off) for the global anon hash table.  Several shifted copies of
 135  * both vp and off are XORed together so that as many of their bits as possible
 136  * contribute, then masked with (anon_hash_size - 1).  See PAGE_HASH_FUNC.
 137  */
 138 #define ANON_HASH(vp, off)      \
 139         (((((uintptr_t)(off) >> PAGESHIFT) ^ \
 140                 ((uintptr_t)(off) >> (PAGESHIFT + anon_hash_shift))) ^ \
 141                 (((uintptr_t)(vp) >> 3) ^ \
 142                 ((uintptr_t)(vp) >> (3 + anon_hash_shift)) ^ \
 143                 ((uintptr_t)(vp) >> (3 + 2 * anon_hash_shift)) ^ \
 144                 ((uintptr_t)(vp) << \
 145                     (anon_hash_shift - AN_VPSHIFT - VNODE_ALIGN_LOG2)))) & \
 146                 (anon_hash_size - 1))
 147 
 148 #define AH_LOCK_SIZE    (2 << NCPU_LOG2)
 149 
 150 #define AH_MUTEX(vp, off)                               \
 151         (&anonhash_lock[(ANON_HASH((vp), (off)) &       \
 152             (AH_LOCK_SIZE - 1))].pad_mutex)
 153 
 154 #endif  /* _KERNEL */
 155 
 156 /*
 157  * Declaration for the Global counters to accurately
 158  * track the kernel foot print in memory.
 159  */
 160 extern  pgcnt_t pages_locked;
 161 extern  pgcnt_t pages_claimed;
 162 extern  pgcnt_t pages_useclaim;
 163 extern  pgcnt_t obp_pages;
 164 
 165 /*
 166  * Anonymous backing store accounting structure for swapctl.
 167  *
 168  * ani_max = maximum amount of swap space
 169  *      (including potentially available physical memory)
 170  * ani_free = amount of unallocated anonymous memory
 171  *      (some of which might be reserved and including
 172  *      potentially available physical memory)
 173  * ani_resv = amount of claimed (reserved) anonymous memory
 174  *
 175  * The swap data can be acquired more efficiently through the
 176  * kstats interface.
 177  * Total slots currently available for reservation =
 178  *      MAX(ani_max - ani_resv, 0) + (availrmem - swapfs_minfree)
 179  */
 180 struct anoninfo {
 181         pgcnt_t ani_max;        /* max swap, incl. potentially avail mem */
 182         pgcnt_t ani_free;       /* unallocated anonymous memory */
 183         pgcnt_t ani_resv;       /* claimed (reserved) anonymous memory */
 184 };
 185 
 186 #ifdef _SYSCALL32
 187 struct anoninfo32 {
 188         size32_t ani_max;       /* size32_t form of anoninfo.ani_max */
 189         size32_t ani_free;      /* size32_t form of anoninfo.ani_free */
 190         size32_t ani_resv;      /* size32_t form of anoninfo.ani_resv */
 191 };
 192 #endif /* _SYSCALL32 */
 193 
 194 /*
 195  * Define the NCPU pool of the ani_free counters. Update the counter
 196  * of the cpu on which the thread is running and in every clock intr
 197  * sync anoninfo.ani_free with the current total of all the NCPU entries.
 198  */
 199 
 200 typedef struct  ani_free {
 201         pgcnt_t         ani_count;      /* this entry's ani_free counter */
 202         uchar_t         pad[64 - sizeof (pgcnt_t)];
 203                         /* pad entry to 64 bytes; XXX 64 = cacheline size */
 204 } ani_free_t;
 205 
 206 #define ANI_MAX_POOL    (NCPU_P2)
 207 extern  ani_free_t      *ani_free_pool;
 208 
 209 /*
 210  * Since each CPU has its own bucket in ani_free_pool, there should be no
 211  * contention here.
 212  */
 213 #define ANI_ADD(inc)    { \
 214         pgcnt_t *ani_countp; \
 215         int     index; \
 216         index = (CPU->cpu_seqid & (ANI_MAX_POOL - 1)); \
 217         ani_countp = &ani_free_pool[index].ani_count; \
 218         atomic_add_long(ani_countp, inc); \
 219 }
 220 
 221 extern void     set_anoninfo(void);
 222 
 223 /*
 224  * Anon array pointers are allocated in chunks. Each chunk
 225  * has PAGESIZE/sizeof(u_long *) of anon pointers.
 226  * There are two levels of arrays for anon array pointers larger
 227  * than a chunk. The first level points to anon array chunks.
 228  * The second level consists of chunks of anon pointers.
 229  *
 230  * If anon array is smaller than a chunk then the whole anon array
 231  * is created (memory is allocated for whole anon array).
 232  * If anon array is larger than a chunk only first level array is
 233  * allocated. Then other arrays (chunks) are allocated only when
 234  * they are initialized with anon pointers.
 235  */
 236 struct anon_hdr {
 237         kmutex_t serial_lock;   /* serialize array chunk allocation */
 238         pgcnt_t size;           /* number of pointers to (anon) pages */
 239         void    **array_chunk;  /* pointers to anon pointers or chunks of */
 240                                 /* anon pointers */
 241         int     flags;          /* ANON_ALLOC_FORCE force preallocation of */
 242                                 /* whole anon array     */
 243 };
 244 
 245 #ifdef  _LP64
 246 #define ANON_PTRSHIFT   3       /* log2 of the 8-byte pointer size */
 247 #define ANON_PTRMASK    (~7)    /* clears the low ANON_PTRSHIFT bits */
 248 #else
 249 #define ANON_PTRSHIFT   2       /* log2 of the 4-byte pointer size */
 250 #define ANON_PTRMASK    (~3)    /* clears the low ANON_PTRSHIFT bits */
 251 #endif
 252 
 253 #define ANON_CHUNK_SIZE         (PAGESIZE >> ANON_PTRSHIFT)
 254 #define ANON_CHUNK_SHIFT        (PAGESHIFT - ANON_PTRSHIFT)
 255 #define ANON_CHUNK_OFF          (ANON_CHUNK_SIZE - 1)
 256 
 257 /*
 258  * Anon flags.
 259  */
 260 #define ANON_SLEEP              0x0     /* ok to block */
 261 #define ANON_NOSLEEP            0x1     /* non-blocking call */
 262 #define ANON_ALLOC_FORCE        0x2     /* force single level anon array */
 263 #define ANON_GROWDOWN           0x4     /* anon array should grow downward */
 264 
 265 struct kshmid;
 266 
 267 /*
 268  * The anon_map structure is used by various clients of the anon layer to
 269  * manage anonymous memory.   When anonymous memory is shared,
 270  * then the different clients sharing it will point to the
 271  * same anon_map structure.  Also, if a segment is unmapped
 272  * in the middle where an anon_map structure exists, the
 273  * newly created segment will also share the anon_map structure,
 274  * although the two segments will use different ranges of the
 275  * anon array.  When mappings are private (or shared with
 276  * a reference count of 1), an unmap operation will free up
 277  * a range of anon slots in the array given by the anon_map
 278  * structure.  Because of fragmentation due to this unmapping,
 279  * we have to store the size of the anon array in the anon_map
 280  * structure so that we can free everything when the reference
 281  * count goes to zero.
 282  *
 283  * A new rangelock scheme is introduced to make the anon layer scale.
 284  * A reader/writer lock per anon_amp and an array of system-wide hash
 285  * locks, anon_array_lock[] are introduced to replace serial_lock and
 286  * anonmap lock.  The writer lock is held when we want to single-thread
 287  * references to the anon array pointers or to the anon_map's
 288  * members, whereas the reader lock and anon_array_lock are
 289  * held to allow multiple threads to reference different parts of
 290  * anon array.  A global set of condition variables, anon_array_cv,
 291  * are used with anon_array_lock[] to make the hold time of the locks
 292  * short.
 293  *
 294  * szc is used to calculate the index of hash locks and cv's.  We
 295  * could've just used seg->s_szc if not for the possible sharing of
 296  * anon_amp between SYSV shared memory and ISM, so now we introduce
 297  * szc in the anon_map structure.  For MAP_SHARED, the amp->szc is either
 298  * 0 (base page size) or page_num_pagesizes() - 1, while MAP_PRIVATE
 299  * the amp->szc could be anything in [0, page_num_pagesizes() - 1].
 300  */
 301 typedef struct anon_map {
 302         krwlock_t a_rwlock;     /* protect anon_map and anon array */
 303         size_t  size;           /* size in bytes mapped by the anon array */
 304         struct  anon_hdr *ahp;  /* anon array header pointer, containing */
 305                                 /* anon pointer array(s) */
 306         size_t  swresv;         /* swap space reserved for this anon_map */
 307         ulong_t refcnt;         /* reference count on this structure */
 308         ushort_t a_szc;         /* max szc among shared processes */
 309         void    *locality;      /* lgroup locality info */
 310         struct kshmid *a_sp;    /* kshmid if amp backs sysV, or NULL */
 311         int     a_purgewait;    /* somebody waits for slocks to go away */
 312         kcondvar_t a_purgecv;   /* cv for waiting for slocks to go away */
 313         kmutex_t a_purgemtx;    /* mutex for anonmap_purge() */
 314         spgcnt_t a_softlockcnt; /* number of pages locked in pcache */
 315         kmutex_t a_pmtx;        /* protects amp's pcache list */
 316         pcache_link_t a_phead;  /* head of amp's pcache list */
 317 } amp_t;
 318 
 319 #ifdef _KERNEL
 320 
 321 #define ANON_BUSY               0x1
 322 #define ANON_ISBUSY(slot)       (*(slot) & ANON_BUSY)
 323 #define ANON_SETBUSY(slot)      (*(slot) |= ANON_BUSY)
 324 #define ANON_CLRBUSY(slot)      (*(slot) &= ~ANON_BUSY)
 325 
 326 #define ANON_MAP_SHIFT          6       /* log2(sizeof (struct anon_map)) */
 327 #define ANON_ARRAY_SHIFT        7       /* log2(ANON_LOCKSIZE) */
 328 #define ANON_LOCKSIZE           128
 329 
 330 #define ANON_LOCK_ENTER(lock, type)     rw_enter((lock), (type))
 331 #define ANON_LOCK_EXIT(lock)            rw_exit((lock))
 332 #define ANON_LOCK_HELD(lock)            RW_LOCK_HELD((lock))
 333 #define ANON_READ_HELD(lock)            RW_READ_HELD((lock))
 334 #define ANON_WRITE_HELD(lock)           RW_WRITE_HELD((lock))
 335 
 336 #define ANON_ARRAY_HASH(amp, idx)\
 337         ((((idx) + ((idx) >> ANON_ARRAY_SHIFT) +\
 338         ((idx) >> (ANON_ARRAY_SHIFT << 1)) +\
 339         ((idx) >> (ANON_ARRAY_SHIFT + (ANON_ARRAY_SHIFT << 1)))) ^\
 340         ((uintptr_t)(amp) >> ANON_MAP_SHIFT)) & (ANON_LOCKSIZE - 1))
 341 
 342 typedef struct anon_sync_obj {
 343         kmutex_t        *sync_mutex;    /* likely an anon_array_lock[] entry */
 344         kcondvar_t      *sync_cv;       /* likely an anon_array_cv[] entry */
 345         ulong_t         *sync_data;     /* likely the ANON_BUSY slot word */
 346 } anon_sync_obj_t;                      /* NOTE(review): confirm the above */
 347 
 348 /*
 349  * Anonymous backing store accounting structure for kernel.
 350  * ani_max = total reservable slots on physical (disk-backed) swap
 351  * ani_phys_resv = total phys slots reserved for use by clients
 352  * ani_mem_resv = total mem slots reserved for use by clients
 353  * ani_free = # unallocated physical slots + # of reserved unallocated
 354  * memory slots
 355  */
 356 
 357 /*
 358  * Initial total swap slots available for reservation
 359  */
 360 #define TOTAL_AVAILABLE_SWAP \
 361         (k_anoninfo.ani_max + MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
 362 
 363 /*
 364  * Swap slots currently available for reservation
 365  */
 366 #define CURRENT_TOTAL_AVAILABLE_SWAP                            \
 367         ((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) +      \
 368             MAX((spgcnt_t)(availrmem - swapfs_minfree), 0))
 369 
 370 struct k_anoninfo {
 371         pgcnt_t ani_max;        /* total reservable slots on phys */
 372                                         /* (disk) swap */
 373         pgcnt_t ani_free;       /* # of unallocated phys and mem slots */
 374         pgcnt_t ani_phys_resv;  /* # of reserved phys (disk) slots */
 375         pgcnt_t ani_mem_resv;   /* # of reserved mem slots */
 376         pgcnt_t ani_locked_swap; /* # of swap slots locked in reserved */
 377                                 /* mem swap */
 378 };
 379 
 380 extern  struct k_anoninfo k_anoninfo;
 381 
 382 extern void     anon_init(void);
 383 extern struct   anon *anon_alloc(struct vnode *, anoff_t);
 384 extern void     anon_dup(struct anon_hdr *, ulong_t,
 385                     struct anon_hdr *, ulong_t, size_t);
 386 extern void     anon_dup_fill_holes(struct anon_hdr *, ulong_t,
 387                     struct anon_hdr *, ulong_t, size_t, uint_t, int);
 388 extern int      anon_fill_cow_holes(struct seg *, caddr_t, struct anon_hdr *,
 389                     ulong_t, struct vnode *, u_offset_t, size_t, uint_t,
 390                     uint_t, struct vpage [], struct cred *);
 391 extern void     anon_free(struct anon_hdr *, ulong_t, size_t);
 392 extern void     anon_free_pages(struct anon_hdr *, ulong_t, size_t, uint_t);
 393 extern void     anon_disclaim(struct anon_map *, ulong_t, size_t);
 394 extern int      anon_getpage(struct anon **, uint_t *, struct page **,
 395                     size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
 396 extern int      swap_getconpage(struct vnode *, u_offset_t, size_t,
 397                     uint_t *, page_t *[], size_t, page_t *, uint_t *,
 398                     spgcnt_t *, struct seg *, caddr_t,
 399                     enum seg_rw, struct cred *);
 400 extern int      anon_map_getpages(struct anon_map *, ulong_t,
 401                     uint_t, struct seg *, caddr_t, uint_t,
 402                     uint_t *, page_t *[], uint_t *,
 403                     struct vpage [], enum seg_rw, int, int, int, struct cred *);
 404 extern int      anon_map_privatepages(struct anon_map *, ulong_t,
 405                     uint_t, struct seg *, caddr_t, uint_t,
 406                     page_t *[], struct vpage [], int, int, struct cred *);
 407 extern struct   page *anon_private(struct anon **, struct seg *,
 408                     caddr_t, uint_t, struct page *,
 409                     int, struct cred *);
 410 extern struct   page *anon_zero(struct seg *, caddr_t,
 411                     struct anon **, struct cred *);
 412 extern int      anon_map_createpages(struct anon_map *, ulong_t,
 413                     size_t, struct page **,
 414                     struct seg *, caddr_t,
 415                     enum seg_rw, struct cred *);
 416 extern int      anon_map_demotepages(struct anon_map *, ulong_t,
 417                     struct seg *, caddr_t, uint_t,
 418                     struct vpage [], struct cred *);
 419 extern void     anon_shmap_free_pages(struct anon_map *, ulong_t, size_t);
 420 extern int      anon_resvmem(size_t, boolean_t, zone_t *, int);
 421 extern void     anon_unresvmem(size_t, zone_t *);
 422 extern struct   anon_map *anonmap_alloc(size_t, size_t, int);
 423 extern void     anonmap_free(struct anon_map *);
 424 extern void     anonmap_purge(struct anon_map *);
 425 extern void     anon_swap_free(struct anon *, struct page *);
 426 extern void     anon_decref(struct anon *);
 427 extern int      non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *);
 428 extern pgcnt_t  anon_pages(struct anon_hdr *, ulong_t, pgcnt_t);
 429 extern int      anon_swap_adjust(pgcnt_t);
 430 extern void     anon_swap_restore(pgcnt_t);
 431 extern struct   anon_hdr *anon_create(pgcnt_t, int);
 432 extern void     anon_release(struct anon_hdr *, pgcnt_t);
 433 extern struct   anon *anon_get_ptr(struct anon_hdr *, ulong_t);
 434 extern ulong_t  *anon_get_slot(struct anon_hdr *, ulong_t);
 435 extern struct   anon *anon_get_next_ptr(struct anon_hdr *, ulong_t *);
 436 extern int      anon_set_ptr(struct anon_hdr *, ulong_t, struct anon *, int);
 437 extern int      anon_copy_ptr(struct anon_hdr *, ulong_t,
 438                     struct anon_hdr *, ulong_t, pgcnt_t, int);
 439 extern pgcnt_t  anon_grow(struct anon_hdr *, ulong_t *, pgcnt_t, pgcnt_t, int);
 440 extern void     anon_array_enter(struct anon_map *, ulong_t,
 441                         anon_sync_obj_t *);
 442 extern int      anon_array_try_enter(struct anon_map *, ulong_t,
 443                         anon_sync_obj_t *);
 444 extern void     anon_array_exit(anon_sync_obj_t *);
 445 
 446 /*
 447  * anon_resv checks to see if there is enough swap space to fulfill a
 448  * request and if so, reserves the appropriate anonymous memory resources.
 449  * anon_checkspace just checks to see if there is space to fulfill the request,
 450  * without taking any resources.  Both return 1 if successful and 0 if not.
 451  * anon_try_resv_zone is anon_resv_zone with a final argument of 0, not 1.
 452  * Macros are provided as anon reservation is usually charged to the zone of
 453  * the current process.  In some cases (such as anon reserved by tmpfs), a
 454  * zone pointer is needed to charge the appropriate zone.
 455  */
 456 #define anon_unresv(size)       anon_unresvmem((size), curproc->p_zone)
 457 #define anon_unresv_zone(size, zone)    anon_unresvmem((size), (zone))
 458 #define anon_resv(size)                 \
 459         anon_resvmem((size), 1, curproc->p_zone, 1)
 460 #define anon_resv_zone(size, zone)      anon_resvmem((size), 1, (zone), 1)
 461 #define anon_checkspace(size, zone)     anon_resvmem((size), 0, (zone), 0)
 462 #define anon_try_resv_zone(size, zone)  anon_resvmem((size), 1, (zone), 0)
 463 
 464 /*
 465  * Flags to anon_private
 466  */
 467 #define STEAL_PAGE      0x1     /* page can be stolen */
 468 #define LOCK_PAGE       0x2     /* page must be ``logically'' locked */
 469 
 470 /*
 471  * SEGKP ANON pages that are locked are assumed to be LWP stack pages
 472  * and thus count towards the user pages locked count.
 473  * This value is protected by the same lock as availrmem.
 474  */
 475 extern pgcnt_t anon_segkp_pages_locked;
 476 
 477 extern int anon_debug;
 478 
 479 #ifdef ANON_DEBUG
 480 
 481 #define A_ANON  0x01
 482 #define A_RESV  0x02
 483 #define A_MRESV 0x04
 484 
 485 /* vararg-like debugging macro. */
 486 #define ANON_PRINT(f, printf_args) \
 487                 if (anon_debug & f) \
 488                         printf printf_args
 489 
 490 #else   /* ANON_DEBUG */
 491 
 492 #define ANON_PRINT(f, printf_args)
 493 
 494 #endif  /* ANON_DEBUG */
 495 
 496 #endif  /* _KERNEL */
 497 
 498 #ifdef  __cplusplus
 499 }
 500 #endif
 501 
 502 #endif  /* _VM_ANON_H */
--- EOF ---