1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  26 /*        All Rights Reserved   */
  27 
  28 /*
  29  * University Copyright- Copyright (c) 1982, 1986, 1988
  30  * The Regents of the University of California
  31  * All Rights Reserved
  32  *
  33  * University Acknowledgment- Portions of this document are derived from
  34  * software developed by the University of California, Berkeley, and its
  35  * contributors.
  36  */
  37 
  38 /*
  39  * VM - paged vnode.
  40  *
  41  * This file supplies vm support for the vnode operations that deal with pages.
  42  */
  43 #include <sys/types.h>
  44 #include <sys/t_lock.h>
  45 #include <sys/param.h>
  46 #include <sys/sysmacros.h>
  47 #include <sys/systm.h>
  48 #include <sys/time.h>
  49 #include <sys/buf.h>
  50 #include <sys/vnode.h>
  51 #include <sys/uio.h>
  52 #include <sys/vmsystm.h>
  53 #include <sys/mman.h>
  54 #include <sys/vfs.h>
  55 #include <sys/cred.h>
  56 #include <sys/user.h>
  57 #include <sys/kmem.h>
  58 #include <sys/cmn_err.h>
  59 #include <sys/debug.h>
  60 #include <sys/cpuvar.h>
  61 #include <sys/vtrace.h>
  62 #include <sys/tnf_probe.h>
  63 
  64 #include <vm/hat.h>
  65 #include <vm/as.h>
  66 #include <vm/seg.h>
  67 #include <vm/rm.h>
  68 #include <vm/pvn.h>
  69 #include <vm/page.h>
  70 #include <vm/seg_map.h>
  71 #include <vm/seg_kmem.h>
  72 #include <sys/fs/swapnode.h>
  73 
  74 int pvn_nofodklust = 0;
  75 int pvn_write_noklust = 0;
  76 
  77 uint_t pvn_vmodsort_supported = 0;      /* set if HAT supports VMODSORT */
  78 uint_t pvn_vmodsort_disable = 0;        /* set in /etc/system to disable HAT */
  79                                         /* support for vmodsort for testing */
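
/*
 * For example (a hypothetical tuning sketch, not required by this file),
 * VMODSORT support can be disabled for testing by adding the following
 * line to /etc/system and rebooting:
 *
 *	set pvn_vmodsort_disable = 1
 */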
  80 
  81 static struct kmem_cache *marker_cache = NULL;
  82 
  83 /*
  84  * Find the largest contiguous run of pages around `addr' (file offset
  85  * `off') for which no pages currently exist and which map to consecutive
  86  * file offsets, while staying within the file system block limits
  87  * (`vp_off' and `vp_len') and the segment's address space limits.
  88  */
  89 page_t *
  90 pvn_read_kluster(
  91         struct vnode *vp,
  92         u_offset_t off,
  93         struct seg *seg,
  94         caddr_t addr,
  95         u_offset_t *offp,                       /* return values */
  96         size_t *lenp,                           /* return values */
  97         u_offset_t vp_off,
  98         size_t vp_len,
  99         int isra)
 100 {
 101         ssize_t deltaf, deltab;
 102         page_t *pp;
 103         page_t *plist = NULL;
 104         spgcnt_t pagesavail;
 105         u_offset_t vp_end;
 106 
 107         ASSERT(off >= vp_off && off < vp_off + vp_len);
 108 
 109         /*
 110          * We only want to do klustering/read ahead if there
 111          * are more than minfree pages currently available.
 112          */
 113         pagesavail = freemem - minfree;
 114 
 115         if (pagesavail <= 0)
 116                 if (isra)
 117                         return ((page_t *)NULL);    /* ra case - give up */
 118                 else
 119                         pagesavail = 1;             /* must return a page */
 120 
 121         /* We calculate in pages instead of bytes due to 32-bit overflows */
 122         if (pagesavail < (spgcnt_t)btopr(vp_len)) {
 123                 /*
 124                  * Don't have enough free memory for the
 125                  * max request, try sizing down vp request.
 126                  */
 127                 deltab = (ssize_t)(off - vp_off);
 128                 vp_len -= deltab;
 129                 vp_off += deltab;
 130                 if (pagesavail < btopr(vp_len)) {
 131                         /*
 132                          * Still not enough memory, just settle for
 133                          * pagesavail which is at least 1.
 134                          */
 135                         vp_len = ptob(pagesavail);
 136                 }
 137         }
 138 
 139         vp_end = vp_off + vp_len;
 140         ASSERT(off >= vp_off && off < vp_end);
 141 
 142         if (isra && SEGOP_KLUSTER(seg, addr, 0))
 143                 return ((page_t *)NULL);        /* segment driver says no */
 144 
 145         if ((plist = page_create_va(vp, off,
 146             PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
 147                 return ((page_t *)NULL);
 148 
 149         if (vp_len <= PAGESIZE || pvn_nofodklust) {
 150                 *offp = off;
 151                 *lenp = MIN(vp_len, PAGESIZE);
 152         } else {
 153                 /*
 154                  * Scan backwards from "off" by incrementing "deltab" and
 155                  * comparing "off" with "vp_off + deltab" to avoid
 156                  * "signed" versus "unsigned" conversion problems.
 157                  */
 158                 for (deltab = PAGESIZE; off >= vp_off + deltab;
 159                     deltab += PAGESIZE) {
 160                         /*
 161                          * Call back to the segment driver to verify that
 162                          * the klustering/read ahead operation makes sense.
 163                          */
 164                         if (SEGOP_KLUSTER(seg, addr, -deltab))
 165                                 break;          /* page not eligible */
 166                         if ((pp = page_create_va(vp, off - deltab,
 167                             PAGESIZE, PG_EXCL, seg, addr - deltab))
 168                             == NULL)
 169                                 break;          /* already have the page */
 170                         /*
 171                          * Add page to front of page list.
 172                          */
 173                         page_add(&plist, pp);
 174                 }
 175                 deltab -= PAGESIZE;
 176 
 177                 /* scan forward from front */
 178                 for (deltaf = PAGESIZE; off + deltaf < vp_end;
 179                     deltaf += PAGESIZE) {
 180                         /*
 181                          * Call back to the segment driver to verify that
 182                          * the klustering/read ahead operation makes sense.
 183                          */
 184                         if (SEGOP_KLUSTER(seg, addr, deltaf))
 185                                 break;          /* page not file extension */
 186                         if ((pp = page_create_va(vp, off + deltaf,
 187                             PAGESIZE, PG_EXCL, seg, addr + deltaf))
 188                             == NULL)
 189                                 break;          /* already have page */
 190 
 191                         /*
 192                          * Add page to end of page list.
 193                          */
 194                         page_add(&plist, pp);
 195                         plist = plist->p_next;
 196                 }
 197                 *offp = off = off - deltab;
 198                 *lenp = deltab + deltaf;
 199                 ASSERT(off >= vp_off);
 200 
 201                 /*
 202                  * If we ended up getting more than was actually
 203                  * requested, retract the returned length to only
 204                  * reflect what was requested.  This might happen
 205                  * if we were allowed to kluster pages across a
 206                  * span of (say) 5 frags, and frag size is less
 207                  * than PAGESIZE.  We need a whole number of
 208                  * pages to contain those frags, but the returned
 209                  * size should only allow the returned range to
 210                  * extend as far as the end of the frags.
 211                  */
 212                 if ((vp_off + vp_len) < (off + *lenp)) {
 213                         ASSERT(vp_end > off);
 214                         *lenp = vp_end - off;
 215                 }
 216         }
 217         TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
 218             "pvn_read_kluster:seg %p addr %x isra %x",
 219             seg, addr, isra);
 220         return (plist);
 221 }
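
/*
 * Illustrative sketch (not taken from any particular file system): a
 * getapage routine would typically use pvn_read_kluster() roughly as
 * follows, where "blkoff"/"blksz" bound the request to one file system
 * block and fs_pageio() is a hypothetical stand-in for the file system's
 * page i/o path:
 *
 *	u_offset_t io_off;
 *	size_t io_len;
 *	page_t *pp;
 *
 *	pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
 *	    blkoff, blksz, 0);
 *	if (pp != NULL) {
 *		err = fs_pageio(vp, pp, io_off, io_len, B_READ);
 *		if (err)
 *			pvn_read_done(pp, B_ERROR);
 *	}
 *
 * On success the still-locked pages are handed back to the caller,
 * e.g. via pvn_plist_init() below.
 */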
 222 
 223 /*
 224  * Handle pages for this vnode on either side of the page "pp",
 225  * which has been locked by the caller.  This routine will also
 226  * do klustering in the range [vp_off, vp_off + vp_len] until it
 227  * encounters a page that is not found.  The offset and length
 228  * of the pages included are returned in "*offp" and "*lenp".
 229  *
 230  * Returns a list of dirty locked pages all ready to be
 231  * written back.
 232  */
 233 page_t *
 234 pvn_write_kluster(
 235         struct vnode *vp,
 236         page_t *pp,
 237         u_offset_t *offp,               /* return values */
 238         size_t *lenp,                   /* return values */
 239         u_offset_t vp_off,
 240         size_t vp_len,
 241         int flags)
 242 {
 243         u_offset_t off;
 244         page_t *dirty;
 245         size_t deltab, deltaf;
 246         se_t se;
 247         u_offset_t vp_end;
 248 
 249         off = pp->p_offset;
 250 
 251         /*
 252          * Klustering should not be done if we are invalidating
 253          * pages since we could destroy pages that belong to
 254          * some other process if this is a swap vnode.
 255          */
 256         if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
 257                 *offp = off;
 258                 *lenp = PAGESIZE;
 259                 return (pp);
 260         }
 261 
 262         if (flags & (B_FREE | B_INVAL))
 263                 se = SE_EXCL;
 264         else
 265                 se = SE_SHARED;
 266 
 267         dirty = pp;
 268         /*
 269          * Scan backwards looking for pages to kluster by incrementing
 270          * "deltab" and comparing "off" with "vp_off + deltab" to
 271          * avoid "signed" versus "unsigned" conversion problems.
 272          */
 273         for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
 274                 pp = page_lookup_nowait(vp, off - deltab, se);
 275                 if (pp == NULL)
 276                         break;          /* page not found */
 277                 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
 278                         break;
 279                 page_add(&dirty, pp);
 280         }
 281         deltab -= PAGESIZE;
 282 
 283         vp_end = vp_off + vp_len;
 284         /* now scan forwards looking for pages to kluster */
 285         for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
 286                 pp = page_lookup_nowait(vp, off + deltaf, se);
 287                 if (pp == NULL)
 288                         break;          /* page not found */
 289                 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
 290                         break;
 291                 page_add(&dirty, pp);
 292                 dirty = dirty->p_next;
 293         }
 294 
 295         *offp = off - deltab;
 296         *lenp = deltab + deltaf;
 297         return (dirty);
 298 }
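
/*
 * Illustrative sketch (hypothetical, simplified): a file system putapage
 * routine commonly uses pvn_write_kluster() to gather adjacent dirty
 * pages before issuing a single write; "blkoff"/"blksz" and fs_pageio()
 * are assumed names, not part of this interface:
 *
 *	u_offset_t io_off;
 *	size_t io_len;
 *	page_t *dirty;
 *
 *	dirty = pvn_write_kluster(vp, pp, &io_off, &io_len,
 *	    blkoff, blksz, flags);
 *	err = fs_pageio(vp, dirty, io_off, io_len, B_WRITE | flags);
 *
 * Once the i/o completes, the list is released with pvn_write_done().
 */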
 299 
 300 /*
 301  * Generic entry point used to release the "shared/exclusive" lock
 302  * and the "p_iolock" on pages after i/o is complete.
 303  */
 304 void
 305 pvn_io_done(page_t *plist)
 306 {
 307         page_t *pp;
 308 
 309         while (plist != NULL) {
 310                 pp = plist;
 311                 page_sub(&plist, pp);
 312                 page_io_unlock(pp);
 313                 page_unlock(pp);
 314         }
 315 }
 316 
 317 /*
 318  * Entry point to be used by file system getpage subr's and
 319  * other such routines which either want to unlock pages (B_ASYNC
 320  * request) or destroy a list of pages if an error occurred.
 321  */
 322 void
 323 pvn_read_done(page_t *plist, int flags)
 324 {
 325         page_t *pp;
 326 
 327         while (plist != NULL) {
 328                 pp = plist;
 329                 page_sub(&plist, pp);
 330                 page_io_unlock(pp);
 331                 if (flags & B_ERROR) {
 332                         /*LINTED: constant in conditional context*/
 333                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 334                 } else {
 335                         (void) page_release(pp, 0);
 336                 }
 337         }
 338 }
 339 
 340 /*
 341  * Automagic pageout.
 342  * When memory gets tight, start freeing pages popping out of the
 343  * write queue.
 344  */
 345 int     write_free = 1;
 346 pgcnt_t pages_before_pager = 200;       /* LMXXX */
 347 
 348 /*
 349  * Routine to be called when page-outs complete.
 350  * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 351  * after waiting for i/o to complete (biowait) to free the list of
 352  * pages associated with the buffer.  These pages must be locked
 353  * before i/o is initiated.
 354  *
 355  * If a write error occurs, the pages are marked as modified
 356  * so the write will be re-tried later.
 357  */
 358 
 359 void
 360 pvn_write_done(page_t *plist, int flags)
 361 {
 362         int dfree = 0;
 363         int pgrec = 0;
 364         int pgout = 0;
 365         int pgpgout = 0;
 366         int anonpgout = 0;
 367         int anonfree = 0;
 368         int fspgout = 0;
 369         int fsfree = 0;
 370         int execpgout = 0;
 371         int execfree = 0;
 372         page_t *pp;
 373         struct cpu *cpup;
 374         struct vnode *vp = NULL;        /* for probe */
 375         uint_t ppattr;
 376         kmutex_t *vphm = NULL;
 377 
 378         ASSERT((flags & B_READ) == 0);
 379 
 380         /*
 381          * If we are about to start paging anyway, start freeing pages.
 382          */
 383         if (write_free && freemem < lotsfree + pages_before_pager &&
 384             (flags & B_ERROR) == 0) {
 385                 flags |= B_FREE;
 386         }
 387 
 388         /*
 389          * Handle each page involved in the i/o operation.
 390          */
 391         while (plist != NULL) {
 392                 pp = plist;
 393                 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
 394                 page_sub(&plist, pp);
 395 
 396                 /* Kernel probe support */
 397                 if (vp == NULL)
 398                         vp = pp->p_vnode;
 399 
 400                 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
 401                         /*
 402                          * Move page to the top of the v_page list.
 403                          * Skip pages modified during IO.
 404                          */
 405                         vphm = page_vnode_mutex(vp);
 406                         mutex_enter(vphm);
 407                         if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
 408                                 page_vpsub(&vp->v_pages, pp);
 409                                 page_vpadd(&vp->v_pages, pp);
 410                         }
 411                         mutex_exit(vphm);
 412                 }
 413 
 414                 if (flags & B_ERROR) {
 415                         /*
 416                          * Write operation failed.  We don't want
 417                          * to destroy (or free) the page unless B_FORCE
 418                          * is set. We set the mod bit again and release
 419                          * all locks on the page so that it will get written
 420                          * back again later when things are hopefully
 421                          * better again.
 422                          * If B_INVAL and B_FORCE are set we really have
 423                          * to destroy the page.
 424                          */
 425                         if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
 426                                 page_io_unlock(pp);
 427                                 /*LINTED: constant in conditional context*/
 428                                 VN_DISPOSE(pp, B_INVAL, 0, kcred);
 429                         } else {
 430                                 hat_setmod_only(pp);
 431                                 page_io_unlock(pp);
 432                                 page_unlock(pp);
 433                         }
 434                 } else if (flags & B_INVAL) {
 435                         /*
 436                          * XXX - Failed writes with B_INVAL set are
 437                          * not handled appropriately.
 438                          */
 439                         page_io_unlock(pp);
 440                         /*LINTED: constant in conditional context*/
 441                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 442                 } else if ((flags & B_FREE) || !hat_page_is_mapped(pp)) {
 443                         /*
 444                          * Update statistics for pages being paged out
 445                          */
 446                         if (pp->p_vnode) {
 447                                 if (IS_SWAPFSVP(pp->p_vnode)) {
 448                                         anonpgout++;
 449                                 } else {
 450                                         if (pp->p_vnode->v_flag & VVMEXEC) {
 451                                                 execpgout++;
 452                                         } else {
 453                                                 fspgout++;
 454                                         }
 455                                 }
 456                         }
 457                         page_io_unlock(pp);
 458                         pgout = 1;
 459                         pgpgout++;
 460                         TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
 461                             "page_ws_out:pp %p", pp);
 462 
 463                         /*
 464                          * The page_struct_lock need not be acquired to
 465                          * examine "p_lckcnt" and "p_cowcnt" since we'll
 466                          * have an "exclusive" lock if the upgrade succeeds.
 467                          */
 468                         if (page_tryupgrade(pp) &&
 469                             pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
 470                                 /*
 471                                  * Check if someone has reclaimed the
 472                                  * page.  If ref and mod are not set, no
 473                                  * one is using it so we can free it.
 474                                  * The rest of the system is careful
 475                                  * to use the NOSYNC flag to unload
 476                                  * translations set up for i/o w/o
 477                                  * affecting ref and mod bits.
 478                                  *
 479                                  * Obtain a copy of the real hardware
 480                                  * mod bit using hat_pagesync(pp, HAT_SYNC_DONTZERO)
 481                                  * to avoid having to flush the cache.
 482                                  */
 483                                 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
 484                                     HAT_SYNC_STOPON_MOD);
 485                         ck_refmod:
 486                                 if (!(ppattr & (P_REF | P_MOD))) {
 487                                         if (hat_page_is_mapped(pp)) {
 488                                                 /*
 489                                                  * Doesn't look like the page
 490                                                  * was modified so now we
 491                                                  * really have to unload the
 492                                                  * translations.  Meanwhile
 493                                                  * another CPU could've
 494                                                  * modified it so we have to
 495                                                  * check again.  We don't loop
 496                                                  * forever here because now
 497                                                  * the translations are gone
 498                                                  * and no one can get a new one
 499                                                  * since we have the "exclusive"
 500                                                  * lock on the page.
 501                                                  */
 502                                                 (void) hat_pageunload(pp,
 503                                                     HAT_FORCE_PGUNLOAD);
 504                                                 ppattr = hat_page_getattr(pp,
 505                                                     P_REF | P_MOD);
 506                                                 goto ck_refmod;
 507                                         }
 508                                         /*
 509                                          * Update statistics for pages being
 510                                          * freed
 511                                          */
 512                                         if (pp->p_vnode) {
 513                                                 if (IS_SWAPFSVP(pp->p_vnode)) {
 514                                                         anonfree++;
 515                                                 } else {
 516                                                         if (pp->p_vnode->v_flag
 517                                                             & VVMEXEC) {
 518                                                                 execfree++;
 519                                                         } else {
 520                                                                 fsfree++;
 521                                                         }
 522                                                 }
 523                                         }
 524                                         /*LINTED: constant in conditional ctx*/
 525                                         VN_DISPOSE(pp, B_FREE,
 526                                             (flags & B_DONTNEED), kcred);
 527                                         dfree++;
 528                                 } else {
 529                                         page_unlock(pp);
 530                                         pgrec++;
 531                                         TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
 532                                             "page_ws_free:pp %p", pp);
 533                                 }
 534                         } else {
 535                                 /*
 536                                  * Page is either `locked' in memory
 537                                  * or was reclaimed and now has a
 538                                  * "shared" lock, so release it.
 539                                  */
 540                                 page_unlock(pp);
 541                         }
 542                 } else {
 543                         /*
 544                          * Neither B_FREE nor B_INVAL nor B_ERROR.
 545                          * Just release locks.
 546                          */
 547                         page_io_unlock(pp);
 548                         page_unlock(pp);
 549                 }
 550         }
 551 
 552         CPU_STATS_ENTER_K();
 553         cpup = CPU;             /* get cpup now that CPU cannot change */
 554         CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
 555         CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
 556         CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
 557         CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
 558         CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
 559         CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
 560         CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
 561         CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
 562         CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
 563         CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
 564         CPU_STATS_EXIT_K();
 565 
 566         /* Kernel probe */
 567         TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
 568             tnf_opaque, vnode,                  vp,
 569             tnf_ulong,  pages_pageout,          pgpgout,
 570             tnf_ulong,  pages_freed,            dfree,
 571             tnf_ulong,  pages_reclaimed,        pgrec);
 572 }
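
/*
 * Illustrative sketch (hypothetical): for a synchronous write the file
 * system waits for the i/o and then hands the page list back here, e.g.
 *
 *	pvn_write_done(dirty, B_WRITE | flags | (err ? B_ERROR : 0));
 *
 * For B_ASYNC requests the same call is typically made from the file
 * system's i/o completion handler instead.
 */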
 573 
 574 /*
 575  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 576  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 577  * operation and is only to be considered if it doesn't involve any
 578  * waiting here.  B_TRUNC indicates that the file is being truncated
 579  * and so no i/o needs to be done. B_FORCE indicates that the page
 580  * must be destroyed so don't try writing it out.
 581  *
 582  * The caller must ensure that the page is locked.  Returns 1, if
 583  * the page should be written back (the "iolock" is held in this
 584  * case), or 0 if the page has been dealt with or has been
 585  * unlocked.
 586  */
 587 int
 588 pvn_getdirty(page_t *pp, int flags)
 589 {
 590         ASSERT((flags & (B_INVAL | B_FREE)) ?
 591             PAGE_EXCL(pp) : PAGE_SHARED(pp));
 592         ASSERT(PP_ISFREE(pp) == 0);
 593 
 594         /*
 595          * If trying to invalidate or free a logically `locked' page,
 596          * forget it.  Don't need page_struct_lock to check p_lckcnt and
 597          * p_cowcnt as the page is exclusively locked.
 598          */
 599         if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
 600             (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
 601                 page_unlock(pp);
 602                 return (0);
 603         }
 604 
 605         /*
 606          * Now acquire the i/o lock so we can add it to the dirty
 607          * list (if necessary).  We avoid blocking on the i/o lock
 608          * in the following cases:
 609          *
 610          *      If B_DELWRI is set, which implies that this request is
 611          *      due to a klustering operation.
 612          *
 613          *      If this is an async (B_ASYNC) operation and we are not doing
 614          *      invalidation (B_INVAL) [The current i/o or fsflush will ensure
 615          *      that the page is written out].
 616          */
 617         if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
 618                 if (!page_io_trylock(pp)) {
 619                         page_unlock(pp);
 620                         return (0);
 621                 }
 622         } else {
 623                 page_io_lock(pp);
 624         }
 625 
 626         /*
 627          * If we want to free or invalidate the page then
 628          * we need to unload it so that anyone who wants
 629          * it will have to take a minor fault to get it.
 630          * Otherwise, we're just writing the page back so we
 631          * need to sync up the hardware and software mod bit to
 632          * detect any future modifications.  We clear the
 633          * software mod bit when we put the page on the dirty
 634          * list.
 635          */
 636         if (flags & (B_INVAL | B_FREE)) {
 637                 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 638         } else {
 639                 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
 640         }
 641 
 642         if (!hat_ismod(pp) || (flags & B_TRUNC)) {
 643                 /*
 644                  * Don't need to add it to the
 645                  * list after all.
 646                  */
 647                 page_io_unlock(pp);
 648                 if (flags & B_INVAL) {
 649                         /*LINTED: constant in conditional context*/
 650                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 651                 } else if (flags & B_FREE) {
 652                         /*LINTED: constant in conditional context*/
 653                         VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
 654                 } else {
 655                         /*
 656                          * This is the advisory path for the callers
 657                          * of VOP_PUTPAGE() who prefer freeing the
 658                          * page _only_ if no one else is accessing it.
 659                          * E.g. segmap_release()
 660                          *
 661                          * The above hat_ismod() check is useless because:
 662                          * (1) we may not be holding SE_EXCL lock;
 663                          * (2) we've not unloaded _all_ translations
 664                          *
 665                          * Let page_release() do the heavy-lifting.
 666                          */
 667                         (void) page_release(pp, 1);
 668                 }
 669                 return (0);
 670         }
 671 
 672         /*
 673          * Page is dirty, get it ready for the write back
 674          * and add page to the dirty list.
 675          */
 676         hat_clrrefmod(pp);
 677 
 678         /*
 679          * If we're going to free the page when we're done
 680          * then we can let others try to use it starting now.
 681          * We'll detect the fact that they used it when the
 682          * i/o is done and avoid freeing the page.
 683          */
 684         if (flags & B_FREE)
 685                 page_downgrade(pp);
 686 
 687 
 688         TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
 689 
 690         return (1);
 691 }
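
/*
 * Illustrative sketch (hypothetical): a caller writing back a single
 * known page typically brackets the write with this check:
 *
 *	if (pvn_getdirty(pp, flags) != 0)
 *		err = fs_putapage(vp, pp, &io_off, &io_len, flags, cr);
 *
 * where fs_putapage() is an assumed name for the file system routine
 * that klusters, starts the i/o and eventually calls pvn_write_done();
 * if pvn_getdirty() returns 0 the page has already been dealt with.
 */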
 692 
 693 
 694 /*ARGSUSED*/
 695 static int
 696 marker_constructor(void *buf, void *cdrarg, int kmflags)
 697 {
 698         page_t *mark = buf;
 699         bzero(mark, sizeof (page_t));
 700         mark->p_hash = PVN_VPLIST_HASH_TAG;
 701         return (0);
 702 }
 703 
 704 void
 705 pvn_init()
 706 {
 707         if (pvn_vmodsort_disable == 0)
 708                 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
 709         marker_cache = kmem_cache_create("marker_cache",
 710             sizeof (page_t), 0, marker_constructor,
 711             NULL, NULL, NULL, NULL, 0);
 712 }
 713 
 714 
 715 /*
 716  * Process a vnode's page list for all pages whose offset is >= off.
 717  * Pages are to either be free'd, invalidated, or written back to disk.
 718  *
 719  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 720  * is specified, otherwise they are "shared" locked.
 721  *
 722  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 723  *
 724  * Special marker page_t's are inserted in the list in order
 725  * to keep track of where we are in the list when locks are dropped.
 726  *
 727  * Note the list is circular and insertions can happen only at the
 728  * head and tail of the list. The algorithm ensures visiting all pages
 729  * on the list in the following way:
 730  *
 731  *    Drop two marker pages at the end of the list.
 732  *
 733  *    Move one marker page backwards towards the start of the list until
 734  *    it is at the list head, processing the pages passed along the way.
 735  *
 736  *    Due to race conditions when the vphm mutex is dropped, additional pages
 737  *    can be added to either end of the list, so we'll continue to move
 738  *    the marker and process pages until it is up against the end marker.
 739  *
 740  * There is one special exit condition. If we are processing a VMODSORT
 741  * vnode and only writing back modified pages, we can stop as soon as
 742  * we run into an unmodified page.  This makes fsync(3) operations fast.
 743  */
 744 int
 745 pvn_vplist_dirty(
 746         vnode_t         *vp,
 747         u_offset_t      off,
 748         int             (*putapage)(vnode_t *, page_t *, u_offset_t *,
 749                         size_t *, int, cred_t *),
 750         int             flags,
 751         cred_t          *cred)
 752 {
 753         page_t          *pp;
 754         page_t          *mark;          /* marker page that moves toward head */
 755         page_t          *end;           /* marker page at end of list */
 756         int             err = 0;
 757         int             error;
 758         kmutex_t        *vphm;
 759         se_t            se;
 760         page_t          **where_to_move;
 761 
 762         ASSERT(vp->v_type != VCHR);
 763 
 764         if (vp->v_pages == NULL)
 765                 return (0);
 766 
 767 
 768         /*
 769          * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
 770          *
 771          * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
 772          * from getting blocked while flushing pages to a dead NFS server.
 773          */
 774         mutex_enter(&vp->v_lock);
 775         if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
 776                 mutex_exit(&vp->v_lock);
 777                 return (EAGAIN);
 778         }
 779 
 780         while (vp->v_flag & VVMLOCK)
 781                 cv_wait(&vp->v_cv, &vp->v_lock);
 782 
 783         if (vp->v_pages == NULL) {
 784                 mutex_exit(&vp->v_lock);
 785                 return (0);
 786         }
 787 
 788         vp->v_flag |= VVMLOCK;
 789         mutex_exit(&vp->v_lock);
 790 
 791 
 792         /*
 793          * Set up the marker pages used to walk the list
 794          */
 795         end = kmem_cache_alloc(marker_cache, KM_SLEEP);
 796         end->p_vnode = vp;
 797         end->p_offset = (u_offset_t)-2;
 798         mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
 799         mark->p_vnode = vp;
 800         mark->p_offset = (u_offset_t)-1;
 801 
 802         /*
 803          * Grab the lock protecting the vnode's page list
 804          * note that this lock is dropped at times in the loop.
 805          */
 806         vphm = page_vnode_mutex(vp);
 807         mutex_enter(vphm);
 808         if (vp->v_pages == NULL)
 809                 goto leave;
 810 
 811         /*
 812          * insert the markers and loop through the list of pages
 813          */
 814         page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
 815         page_vpadd(&mark->p_vpnext, end);
 816         for (;;) {
 817 
 818                 /*
 819                  * If only doing an async write back, then we can
 820                  * stop as soon as we get to the start of the list.
 821                  */
 822                 if (flags == B_ASYNC && vp->v_pages == mark)
 823                         break;
 824 
 825                 /*
 826                  * otherwise stop when we've gone through all the pages
 827                  */
 828                 if (mark->p_vpprev == end)
 829                         break;
 830 
 831                 pp = mark->p_vpprev;
 832                 if (vp->v_pages == pp)
 833                         where_to_move = &vp->v_pages;
 834                 else
 835                         where_to_move = &pp->p_vpprev->p_vpnext;
 836 
 837                 ASSERT(pp->p_vnode == vp);
 838 
 839                 /*
 840                  * If just flushing dirty pages to disk and this vnode
 841                  * is using a sorted list of pages, we can stop processing
 842                  * as soon as we find an unmodified page, since all the
 843                  * modified pages are visited first.
 844                  */
 845                 if (IS_VMODSORT(vp) &&
 846                     !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
 847                         if (!hat_ismod(pp) && !page_io_locked(pp)) {
 848 #ifdef  DEBUG
 849                                 /*
 850                                  * For debug kernels examine what should be
 851                                  * all the remaining clean pages, asserting
 852                                  * that they are not modified.
 853                                  */
 854                                 page_t  *chk = pp;
 855                                 int     attr;
 856 
 857                                 page_vpsub(&vp->v_pages, mark);
 858                                 page_vpadd(where_to_move, mark);
 859                                 do {
 860                                         chk = chk->p_vpprev;
 861                                         ASSERT(chk != end);
 862                                         if (chk == mark)
 863                                                 continue;
 864                                         attr = hat_page_getattr(chk, P_MOD |
 865                                             P_REF);
 866                                         if ((attr & P_MOD) == 0)
 867                                                 continue;
 868                                         panic("v_pages list not all clean: "
 869                                             "page_t*=%p vnode=%p off=%lx "
 870                                             "attr=0x%x last clean page_t*=%p\n",
 871                                             (void *)chk, (void *)chk->p_vnode,
 872                                             (long)chk->p_offset, attr,
 873                                             (void *)pp);
 874                                 } while (chk != vp->v_pages);
 875 #endif
 876                                 break;
 877                         } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
 878                                 /*
 879                                  * Couldn't get io lock, wait until IO is done.
 880                                  * Block only for sync IO since we don't want
 881                                  * to block async IO.
 882                                  */
 883                                 mutex_exit(vphm);
 884                                 page_io_wait(pp);
 885                                 mutex_enter(vphm);
 886                                 continue;
 887                         }
 888                 }
 889 
 890                 /*
 891                  * Skip this page if the offset is out of the desired range.
 892                  * Just move the marker and continue.
 893                  */
 894                 if (pp->p_offset < off) {
 895                         page_vpsub(&vp->v_pages, mark);
 896                         page_vpadd(where_to_move, mark);
 897                         continue;
 898                 }
 899 
 900                 /*
 901                  * If we are supposed to invalidate or free this
 902                  * page, then we need an exclusive lock.
 903                  */
 904                 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
 905 
 906                 /*
 907                  * We must acquire the page lock for all synchronous
 908                  * operations (invalidate, free and write).
 909                  */
 910                 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
 911                         /*
 912                          * If the page_lock() drops the mutex
 913                          * we must retry the loop.
 914                          */
 915                         if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
 916                                 continue;
 917 
 918                         /*
 919                          * It's ok to move the marker page now.
 920                          */
 921                         page_vpsub(&vp->v_pages, mark);
 922                         page_vpadd(where_to_move, mark);
 923                 } else {
 924 
 925                         /*
 926                          * update the marker page for all remaining cases
 927                          */
 928                         page_vpsub(&vp->v_pages, mark);
 929                         page_vpadd(where_to_move, mark);
 930 
 931                         /*
 932                          * For write backs, if we can't lock the page, it's
 933                          * invalid or in the process of being destroyed.  Skip
 934                          * it, assuming someone else is writing it.
 935                          */
 936                         if (!page_trylock(pp, se))
 937                                 continue;
 938                 }
 939 
 940                 ASSERT(pp->p_vnode == vp);
 941 
 942                 /*
 943                  * Successfully locked the page, now figure out what to
 944                  * do with it.  Free pages are easily dealt with: invalidate
 945                  * them if desired, or just go on to the next page.
 946                  */
 947                 if (PP_ISFREE(pp)) {
 948                         if ((flags & B_INVAL) == 0) {
 949                                 page_unlock(pp);
 950                                 continue;
 951                         }
 952 
 953                         /*
 954                          * Invalidate (destroy) the page.
 955                          */
 956                         mutex_exit(vphm);
 957                         page_destroy_free(pp);
 958                         mutex_enter(vphm);
 959                         continue;
 960                 }
 961 
 962                 /*
 963                  * pvn_getdirty() figures out what to do with a dirty page.
 964                  * If the page is dirty, the putapage() routine will write it
 965                  * and will kluster any other adjacent dirty pages it can.
 966                  *
 967                  * pvn_getdirty() and `(*putapage)' unlock the page.
 968                  */
 969                 mutex_exit(vphm);
 970                 if (pvn_getdirty(pp, flags)) {
 971                         error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
 972                         if (!err)
 973                                 err = error;
 974                 }
 975                 mutex_enter(vphm);
 976         }
 977         page_vpsub(&vp->v_pages, mark);
 978         page_vpsub(&vp->v_pages, end);
 979 
 980 leave:
 981         /*
 982          * Release the v_pages mutex, clear VVMLOCK and wake up blocked threads.
 983          */
 984         mutex_exit(vphm);
 985         kmem_cache_free(marker_cache, mark);
 986         kmem_cache_free(marker_cache, end);
 987         mutex_enter(&vp->v_lock);
 988         vp->v_flag &= ~VVMLOCK;
 989         cv_broadcast(&vp->v_cv);
 990         mutex_exit(&vp->v_lock);
 991         return (err);
 992 }
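
/*
 * Illustrative sketch (hypothetical): a VOP_PUTPAGE implementation that
 * is asked to flush an entire file (len == 0) commonly reduces to a
 * single call here, passing its own putapage routine as the callback:
 *
 *	if (len == 0)
 *		err = pvn_vplist_dirty(vp, (u_offset_t)off,
 *		    fs_putapage, flags, cr);
 *
 * fs_putapage is an assumed name; it must match the putapage signature
 * declared above and unlock every page handed to it.
 */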
 993 
 994 /*
 995  * Walk the vp->v_pages list; for every page call the callback function
 996  * pointed to by *page_check. If page_check returns non-zero, mark the
 997  * page as modified and, if VMODSORT is set, move it to the end of the
 998  * v_pages list. Moving makes sense only if we have at least two pages;
 999  * this also avoids having v_pages temporarily being NULL after calling
1000  * page_vpsub() if there was just one page.
1001  */
1002 void
1003 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1004 {
1005         page_t  *pp, *next, *end;
1006         kmutex_t        *vphm;
1007         int     shuffle;
1008 
1009         vphm = page_vnode_mutex(vp);
1010         mutex_enter(vphm);
1011 
1012         if (vp->v_pages == NULL) {
1013                 mutex_exit(vphm);
1014                 return;
1015         }
1016 
1017         end = vp->v_pages->p_vpprev;
1018         shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1019         pp = vp->v_pages;
1020 
1021         for (;;) {
1022                 next = pp->p_vpnext;
1023                 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1024                         /*
1025                          * hat_setmod_only() in contrast to hat_setmod() does
1026                          * not shuffle the pages and does not grab the mutex
1027                          * page_vnode_mutex. Exactly what we need.
1028                          */
1029                         hat_setmod_only(pp);
1030                         if (shuffle) {
1031                                 page_vpsub(&vp->v_pages, pp);
1032                                 ASSERT(vp->v_pages != NULL);
1033                                 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1034                                     pp);
1035                         }
1036                 }
1037                 /* Stop if we have just processed the last page. */
1038                 if (pp == end)
1039                         break;
1040                 pp = next;
1041         }
1042 
1043         mutex_exit(vphm);
1044 }
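
/*
 * Illustrative sketch (hypothetical): a file system that needs every
 * cached page of a vnode rewritten could pass a trivial callback:
 *
 *	static int
 *	fs_mark_all(page_t *pp)
 *	{
 *		return (1);
 *	}
 *
 *	pvn_vplist_setdirty(vp, fs_mark_all);
 *
 * fs_mark_all is an assumed name; a real callback would usually inspect
 * the page before deciding whether to mark it.
 */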
1045 
1046 /*
1047  * Zero out zbytes worth of data. Caller should be aware that this
1048  * routine may enter back into the fs layer (xxx_getpage). Locks
1049  * that the xxx_getpage routine may need should not be held while
1050  * calling this.
1051  */
1052 void
1053 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1054 {
1055         caddr_t addr;
1056 
1057         ASSERT(vp->v_type != VCHR);
1058 
1059         if (vp->v_pages == NULL)
1060                 return;
1061 
1062         /*
1063          * zbytes may be zero but there still may be some portion of
1064          * a page which needs clearing (since zbytes is a function
1065          * of filesystem block size, not pagesize.)
1066          */
1067         if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1068                 return;
1069 
1070         /*
1071          * We get the last page and handle the partial
1072          * zeroing via kernel mappings.  This will make the page
1073          * dirty so that we know that when this page is written
1074          * back, the zeroed information will go out with it.  If
1075          * the page is not currently in memory, then the kzero
1076          * operation will cause it to be brought in.  We use kzero
1077          * instead of bzero so that if the page cannot be read in
1078          * for any reason, the system will not panic.  We need
1079          * to zero out at least the fs-supplied zbytes, but we
1080          * might also have to do more to get the entire last page.
1081          */
1082 
1083         if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1084                 panic("pvn_vptrunc zbytes");
1085         addr = segmap_getmapflt(segkmap, vp, vplen,
1086             MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1087         (void) kzero(addr + (vplen & MAXBOFFSET),
1088             MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1089         (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1090 }
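
/*
 * Illustrative sketch (hypothetical): when truncating a file down to
 * "size", a file system zeroes the remainder of the last file system
 * block so stale data is not written back:
 *
 *	pvn_vpzero(vp, (u_offset_t)size, (size_t)(bsize - boff));
 *
 * where "bsize" is the file system block size and "boff" is the offset
 * of "size" within its block (both names are illustrative only).
 */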
1091 
1092 /*
1093  * Handles common work of the VOP_GETPAGE routines when more than
1094  * one page must be returned by calling a file system specific operation
1095  * to do most of the work.  Must be called with the vp already locked
1096  * by the VOP_GETPAGE routine.
1097  */
1098 int
1099 pvn_getpages(
1100         int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1101                 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1102         struct vnode *vp,
1103         u_offset_t off,
1104         size_t len,
1105         uint_t *protp,
1106         page_t *pl[],
1107         size_t plsz,
1108         struct seg *seg,
1109         caddr_t addr,
1110         enum seg_rw rw,
1111         struct cred *cred)
1112 {
1113         page_t **ppp;
1114         u_offset_t o, eoff;
1115         size_t sz, xlen;
1116         int err;
1117 
1118         ASSERT(plsz >= len);         /* ensure that we have enough space */
1119 
1120         /*
1121          * Loop one page at a time and let the getapage function fill
1122          * in the next page in the array.  We only allow one page to be
1123          * returned at a time (except for the last page) so that we
1124          * don't have any problems with duplicates and other such
1125          * painful problems.  This is a very simple minded algorithm,
1126          * but it does the job correctly.  We hope that the cost of a
1127          * getapage call for a resident page that we might have been
1128          * able to get from an earlier call doesn't cost too much.
1129          */
1130         ppp = pl;
1131         sz = PAGESIZE;
1132         eoff = off + len;
1133         xlen = len;
1134         for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1135             xlen -= PAGESIZE) {
1136                 if (o + PAGESIZE >= eoff) {
1137                         /*
1138                          * Last time through - allow all of
1139                          * what's left of the pl[] array to be used.
1140                          */
1141                         sz = plsz - (o - off);
1142                 }
1143                 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1144                     rw, cred);
1145                 if (err) {
1146                         /*
1147                          * Release any pages we already got.
1148                          */
1149                         if (o > off && pl != NULL) {
1150                                 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1151                                         (void) page_release(*ppp, 1);
1152                         }
1153                         break;
1154                 }
1155                 if (pl != NULL)
1156                         ppp++;
1157         }
1158         return (err);
1159 }
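
/*
 * Illustrative sketch (hypothetical): a VOP_GETPAGE implementation
 * usually handles a single-page request directly and lets pvn_getpages()
 * drive its getapage routine for anything larger; fs_getapage is an
 * assumed name:
 *
 *	if (len <= PAGESIZE)
 *		err = fs_getapage(vp, (u_offset_t)off, len, protp, pl,
 *		    plsz, seg, addr, rw, cr);
 *	else
 *		err = pvn_getpages(fs_getapage, vp, (u_offset_t)off, len,
 *		    protp, pl, plsz, seg, addr, rw, cr);
 */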
1160 
1161 /*
1162  * Initialize the page list array.
1163  */
1164 /*ARGSUSED*/
1165 void
1166 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1167     u_offset_t off, size_t io_len, enum seg_rw rw)
1168 {
1169         ssize_t sz;
1170         page_t *ppcur, **ppp;
1171 
1172         /*
1173          * Set up to load plsz worth
1174          * starting at the needed page.
1175          */
1176         while (pp != NULL && pp->p_offset != off) {
1177                 /*
1178                  * Remove page from the i/o list,
1179                  * release the i/o and the page lock.
1180                  */
1181                 ppcur = pp;
1182                 page_sub(&pp, ppcur);
1183                 page_io_unlock(ppcur);
1184                 (void) page_release(ppcur, 1);
1185         }
1186 
1187         if (pp == NULL) {
1188                 pl[0] = NULL;
1189                 return;
1190         }
1191 
1192         sz = plsz;
1193 
1194         /*
1195          * Initialize the page list array.
1196          */
1197         ppp = pl;
1198         do {
1199                 ppcur = pp;
1200                 *ppp++ = ppcur;
1201                 page_sub(&pp, ppcur);
1202                 page_io_unlock(ppcur);
1203                 if (rw != S_CREATE)
1204                         page_downgrade(ppcur);
1205                 sz -= PAGESIZE;
1206         } while (sz > 0 && pp != NULL);
1207         *ppp = NULL;            /* terminate list */
1208 
1209         /*
1210          * Now free the remaining pages that weren't
1211          * loaded in the page list.
1212          */
1213         while (pp != NULL) {
1214                 ppcur = pp;
1215                 page_sub(&pp, ppcur);
1216                 page_io_unlock(ppcur);
1217                 (void) page_release(ppcur, 1);
1218         }
1219 }
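
/*
 * Illustrative sketch (hypothetical): after a klustered read completes
 * successfully, a getapage routine hands the surviving pages back to its
 * caller with
 *
 *	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
 *
 * where "pp" is the list returned by pvn_read_kluster(), "off" is the
 * offset the caller originally asked for, and "pl"/"plsz" are the page
 * list array and its size as passed in to VOP_GETPAGE.
 */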