1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2015, Joyent, Inc.  All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 /*
  41  * VM - address spaces.
  42  */
  43 
  44 #include <sys/types.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/param.h>
  47 #include <sys/errno.h>
  48 #include <sys/systm.h>
  49 #include <sys/mman.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/cpuvar.h>
  52 #include <sys/sysinfo.h>
  53 #include <sys/kmem.h>
  54 #include <sys/vnode.h>
  55 #include <sys/vmsystm.h>
  56 #include <sys/cmn_err.h>
  57 #include <sys/debug.h>
  58 #include <sys/tnf_probe.h>
  59 #include <sys/vtrace.h>
  60 
  61 #include <vm/hat.h>
  62 #include <vm/xhat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_vn.h>
  66 #include <vm/seg_dev.h>
  67 #include <vm/seg_kmem.h>
  68 #include <vm/seg_map.h>
  69 #include <vm/seg_spt.h>
  70 #include <vm/page.h>
  71 
  72 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
  73 
  74 static struct kmem_cache *as_cache;
  75 
  76 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
  77 static void as_clearwatchprot(struct as *, caddr_t, size_t);
  78 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
  79 
  80 
  81 /*
  82  * Verifying the segment lists is very time-consuming; it may not
  83  * always be desirable to define VERIFY_SEGLIST when DEBUG is set.
  84  */
  85 #ifdef DEBUG
  86 #define VERIFY_SEGLIST
  87 int do_as_verify = 0;
  88 #endif
  89 
  90 /*
  91  * Allocate a new callback data structure entry and fill in the events of
  92  * interest, the address range of interest, and the callback argument.
  93  * Link the entry on the as->a_callbacks list. A callback entry for the
  94  * entire address space may be specified with vaddr = 0 and size = -1.
  95  *
  96  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
  97  * the specified as, the caller must guarantee persistence of the specified as
  98  * for the duration of this function (e.g. pages being locked within the as
  99  * will guarantee persistence).
 100  */
 101 int
 102 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
 103                 caddr_t vaddr, size_t size, int sleepflag)
 104 {
 105         struct as_callback      *current_head, *cb;
 106         caddr_t                 saddr;
 107         size_t                  rsize;
 108 
 109         /* callback function and an event are mandatory */
 110         if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
 111                 return (EINVAL);
 112 
 113         /* Adding a callback after as_free has been called is not allowed */
 114         if (as == &kas)
 115                 return (ENOMEM);
 116 
 117         /*
 118          * vaddr = 0 and size = -1 is used to indicate that the callback range
 119          * is the entire address space so no rounding is done in that case.
 120          */
 121         if (size != -1) {
 122                 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
 123                 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
 124                     (size_t)saddr;
 125                 /* check for wraparound */
 126                 if (saddr + rsize < saddr)
 127                         return (ENOMEM);
 128         } else {
 129                 if (vaddr != 0)
 130                         return (EINVAL);
 131                 saddr = vaddr;
 132                 rsize = size;
 133         }
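
        /*
         * For illustration (assuming 4K pages, PAGESIZE = 0x1000): a
         * request with vaddr = (caddr_t)0x12f80 and size = 0x100 rounds
         * out to saddr = 0x12000 and rsize = 0x2000, i.e. the smallest
         * page-aligned range covering [vaddr, vaddr + size).
         */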
 134 
 135         /* Allocate and initialize a callback entry */
 136         cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
 137         if (cb == NULL)
 138                 return (EAGAIN);
 139 
 140         cb->ascb_func = cb_func;
 141         cb->ascb_arg = arg;
 142         cb->ascb_events = events;
 143         cb->ascb_saddr = saddr;
 144         cb->ascb_len = rsize;
 145 
 146         /* Add the entry to the list */
 147         mutex_enter(&as->a_contents);
 148         current_head = as->a_callbacks;
 149         as->a_callbacks = cb;
 150         cb->ascb_next = current_head;
 151 
 152         /*
 153          * The call to this function may lose in a race with
 154  * a pertinent event - e.g. a thread does long-term memory locking,
 155          * but before the callback is added another thread executes as_unmap.
 156          * A broadcast here resolves that.
 157          */
 158         if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
 159                 AS_CLRUNMAPWAIT(as);
 160                 cv_broadcast(&as->a_cv);
 161         }
 162 
 163         mutex_exit(&as->a_contents);
 164         return (0);
 165 }
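
/*
 * Usage sketch (hypothetical driver code, not part of this file): a driver
 * that keeps user pages locked long-term can register a callback so it is
 * asked to drop its locks when a pertinent event occurs.  The "xx_" names
 * and the choice of AS_ALL_EVENT and KM_SLEEP below are illustrative
 * assumptions only.
 *
 *	static void
 *	xx_as_callback(struct as *as, void *arg, uint_t events)
 *	{
 *		xx_state_t *xsp = arg;
 *
 *		xx_unlock_pages(xsp);
 *		(void) as_delete_callback(as, arg);
 *	}
 *
 *	if (as_add_callback(curproc->p_as, xx_as_callback, xsp,
 *	    AS_ALL_EVENT, uvaddr, len, KM_SLEEP) != 0)
 *		return (EAGAIN);
 */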
 166 
 167 /*
 168  * Search the callback list for an entry which pertains to arg.
 169  *
 170  * This is called from within the client upon completion of the callback.
 171  * RETURN VALUES:
 172  *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 173  *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 174  *      AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 175  *                      entry will be made in as_do_callbacks)
 176  *
 177  * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 178  * set, it indicates that as_do_callbacks is processing this entry.  The
 179  * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 180  * to unblock as_do_callbacks, in case it is blocked.
 181  *
 182  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 183  * the specified as, the caller must guarantee persistence of the specified as
 184  * for the duration of this function (e.g. pages being locked within the as
 185  * will guarantee persistence).
 186  */
 187 uint_t
 188 as_delete_callback(struct as *as, void *arg)
 189 {
 190         struct as_callback **prevcb = &as->a_callbacks;
 191         struct as_callback *cb;
 192         uint_t rc = AS_CALLBACK_NOTFOUND;
 193 
 194         mutex_enter(&as->a_contents);
 195         for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
 196                 if (cb->ascb_arg != arg)
 197                         continue;
 198 
 199                 /*
 200                  * If the events indicate AS_CALLBACK_CALLED, just clear
 201                  * AS_ALL_EVENT in the events field and wakeup the thread
 202                  * that may be waiting in as_do_callbacks.  as_do_callbacks
 203                  * will take care of removing this entry from the list.  In
 204                  * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
 205                  * (AS_CALLBACK_CALLED not set), just remove it from the
 206                  * list, return the memory and return AS_CALLBACK_DELETED.
 207                  */
 208                 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
 209                         /* leave AS_CALLBACK_CALLED */
 210                         cb->ascb_events &= ~AS_ALL_EVENT;
 211                         rc = AS_CALLBACK_DELETE_DEFERRED;
 212                         cv_broadcast(&as->a_cv);
 213                 } else {
 214                         *prevcb = cb->ascb_next;
 215                         kmem_free(cb, sizeof (struct as_callback));
 216                         rc = AS_CALLBACK_DELETED;
 217                 }
 218                 break;
 219         }
 220         mutex_exit(&as->a_contents);
 221         return (rc);
 222 }
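
/*
 * Caller sketch (hypothetical): a driver tearing down its long-term locks
 * typically treats the deferred return as "the in-progress callback will
 * finish the cleanup"; xx_unlock_pages() is an illustrative name only.
 *
 *	switch (as_delete_callback(as, xsp)) {
 *	case AS_CALLBACK_DELETE_DEFERRED:
 *		break;		(callback is running and cleans up itself)
 *	case AS_CALLBACK_DELETED:
 *	case AS_CALLBACK_NOTFOUND:
 *	default:
 *		xx_unlock_pages(xsp);
 *		break;
 *	}
 */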
 223 
 224 /*
 225  * Searches the as callback list for a matching entry.
 226  * Returns a pointer to the first matching callback, or NULL if
 227  * nothing is found.
 228  * This function never sleeps, so it is ok to call it with locks
 229  * held beyond the (required) a_contents mutex.
 230  *
 231  * See also comment on as_do_callbacks below.
 232  */
 233 static struct as_callback *
 234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
 235                         size_t event_len)
 236 {
 237         struct as_callback      *cb;
 238 
 239         ASSERT(MUTEX_HELD(&as->a_contents));
 240         for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
 241                 /*
 242                  * If the callback has not already been called, then
 243                  * check if events or address range pertains.  An event_len
 244                  * of zero means do an unconditional callback.
 245                  */
 246                 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
 247                     ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
 248                     (event_addr + event_len < cb->ascb_saddr) ||
 249                     (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
 250                         continue;
 251                 }
 252                 break;
 253         }
 254         return (cb);
 255 }
 256 
 257 /*
 258  * Executes a given callback and removes it from the callback list for
 259  * this address space.
 260  * This function may sleep so the caller must drop all locks except
 261  * a_contents before calling this func.
 262  *
 263  * See also comments on as_do_callbacks below.
 264  */
 265 static void
 266 as_execute_callback(struct as *as, struct as_callback *cb,
 267                                 uint_t events)
 268 {
 269         struct as_callback **prevcb;
 270         void    *cb_arg;
 271 
 272         ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
 273         cb->ascb_events |= AS_CALLBACK_CALLED;
 274         mutex_exit(&as->a_contents);
 275         (*cb->ascb_func)(as, cb->ascb_arg, events);
 276         mutex_enter(&as->a_contents);
 277         /*
 278          * The callback function is required to delete the callback
 279          * when it determines it is OK for this thread to continue.
 280          * as_delete_callback will clear AS_ALL_EVENT in the events
 281          * field when the entry is deleted.
 282          * If the callback function called as_delete_callback,
 283          * events will already be cleared and there will be no blocking.
 284          */
 285         while ((cb->ascb_events & events) != 0) {
 286                 cv_wait(&as->a_cv, &as->a_contents);
 287         }
 288         /*
 289          * This entry needs to be taken off the list. Normally, the
 290          * callback func itself does that, but unfortunately the list
 291          * may have changed while the callback was running because the
 292          * a_contents mutex was dropped and someone else other than the
 293          * callback func itself could have called as_delete_callback,
 294          * so we have to search to find this entry again.  The entry
 295          * must have AS_CALLBACK_CALLED, and have the same 'arg'.
 296          */
 297         cb_arg = cb->ascb_arg;
 298         prevcb = &as->a_callbacks;
 299         for (cb = as->a_callbacks; cb != NULL;
 300             prevcb = &cb->ascb_next, cb = *prevcb) {
 301                 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
 302                     (cb_arg != cb->ascb_arg)) {
 303                         continue;
 304                 }
 305                 *prevcb = cb->ascb_next;
 306                 kmem_free(cb, sizeof (struct as_callback));
 307                 break;
 308         }
 309 }
 310 
 311 /*
 312  * Check the callback list for a matching event and intersection of
 313  * address range. If there is a match, invoke the callback.  Skip an entry if:
 314  *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 315  *    - the event is not of interest
 316  *    - the address range is not of interest
 317  *
 318  * An event_len of zero indicates a request for an unconditional callback
 319  * (regardless of event); only AS_CALLBACK_CALLED is checked.  The
 320  * a_contents lock must be dropped before a callback, so only one callback
 321  * can be done before returning. Return -1 (true) if a callback was
 322  * executed and removed from the list, else return 0 (false).
 323  *
 324  * The logically separate parts, i.e. finding a matching callback and
 325  * executing a given callback have been separated into two functions
 326  * so that they can be called with different sets of locks held beyond
 327  * the always-required a_contents. as_find_callback does not sleep so
 328  * it is ok to call it if more locks than a_contents (i.e. the a_lock
 329  * rwlock) are held. as_execute_callback on the other hand may sleep
 330  * so all locks beyond a_contents must be dropped by the caller if one
 331  * does not want to end up comatose.
 332  */
 333 static int
 334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
 335                         size_t event_len)
 336 {
 337         struct as_callback *cb;
 338 
 339         if ((cb = as_find_callback(as, events, event_addr, event_len))) {
 340                 as_execute_callback(as, cb, events);
 341                 return (-1);
 342         }
 343         return (0);
 344 }
 345 
 346 /*
 347  * Search for the segment containing addr. If a segment containing addr
 348  * exists, that segment is returned.  If no such segment exists, and
 349  * the list spans addresses greater than addr, then the first segment
 350  * whose base is greater than addr is returned; otherwise, NULL is
 351  * returned unless tail is true, in which case the last element of the
 352  * list is returned.
 353  *
 354  * a_seglast is used to cache the last found segment for repeated
 355  * searches to the same addr (which happens frequently).
 356  */
 357 struct seg *
 358 as_findseg(struct as *as, caddr_t addr, int tail)
 359 {
 360         struct seg *seg = as->a_seglast;
 361         avl_index_t where;
 362 
 363         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
 364 
 365         if (seg != NULL &&
 366             seg->s_base <= addr &&
 367             addr < seg->s_base + seg->s_size)
 368                 return (seg);
 369 
 370         seg = avl_find(&as->a_segtree, &addr, &where);
 371         if (seg != NULL)
 372                 return (as->a_seglast = seg);
 373 
 374         seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 375         if (seg == NULL && tail)
 376                 seg = avl_last(&as->a_segtree);
 377         return (as->a_seglast = seg);
 378 }
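
/*
 * For example (illustrative addresses): with segments covering
 * [0x10000, 0x20000) and [0x30000, 0x40000), as_findseg() returns the
 * first segment for addr 0x15000, the second segment for addr 0x25000
 * (first base greater than addr), and NULL for addr 0x50000 unless
 * tail is true, in which case the last segment is returned.
 */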
 379 
 380 #ifdef VERIFY_SEGLIST
 381 /*
 382  * verify that the linked list is coherent
 383  */
 384 static void
 385 as_verify(struct as *as)
 386 {
 387         struct seg *seg, *seglast, *p, *n;
 388         uint_t nsegs = 0;
 389 
 390         if (do_as_verify == 0)
 391                 return;
 392 
 393         seglast = as->a_seglast;
 394 
 395         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 396                 ASSERT(seg->s_as == as);
 397                 p = AS_SEGPREV(as, seg);
 398                 n = AS_SEGNEXT(as, seg);
 399                 ASSERT(p == NULL || p->s_as == as);
 400                 ASSERT(p == NULL || p->s_base < seg->s_base);
 401                 ASSERT(n == NULL || n->s_base > seg->s_base);
 402                 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
 403                 if (seg == seglast)
 404                         seglast = NULL;
 405                 nsegs++;
 406         }
 407         ASSERT(seglast == NULL);
 408         ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
 409 }
 410 #endif /* VERIFY_SEGLIST */
 411 
 412 /*
 413  * Add a new segment to the address space. The avl_find()
 414  * may be expensive, so we attempt to use the last segment accessed
 415  * in as_gap() as an insertion point.
 416  */
 417 int
 418 as_addseg(struct as  *as, struct seg *newseg)
 419 {
 420         struct seg *seg;
 421         caddr_t addr;
 422         caddr_t eaddr;
 423         avl_index_t where;
 424 
 425         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
 426 
 427         as->a_updatedir = 1; /* inform /proc */
 428         gethrestime(&as->a_updatetime);
 429 
 430         if (as->a_lastgaphl != NULL) {
 431                 struct seg *hseg = NULL;
 432                 struct seg *lseg = NULL;
 433 
 434                 if (as->a_lastgaphl->s_base > newseg->s_base) {
 435                         hseg = as->a_lastgaphl;
 436                         lseg = AVL_PREV(&as->a_segtree, hseg);
 437                 } else {
 438                         lseg = as->a_lastgaphl;
 439                         hseg = AVL_NEXT(&as->a_segtree, lseg);
 440                 }
 441 
 442                 if (hseg && lseg && lseg->s_base < newseg->s_base &&
 443                     hseg->s_base > newseg->s_base) {
 444                         avl_insert_here(&as->a_segtree, newseg, lseg,
 445                             AVL_AFTER);
 446                         as->a_lastgaphl = NULL;
 447                         as->a_seglast = newseg;
 448                         return (0);
 449                 }
 450                 as->a_lastgaphl = NULL;
 451         }
 452 
 453         addr = newseg->s_base;
 454         eaddr = addr + newseg->s_size;
 455 again:
 456 
 457         seg = avl_find(&as->a_segtree, &addr, &where);
 458 
 459         if (seg == NULL)
 460                 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 461 
 462         if (seg == NULL)
 463                 seg = avl_last(&as->a_segtree);
 464 
 465         if (seg != NULL) {
 466                 caddr_t base = seg->s_base;
 467 
 468                 /*
 469                  * If top of seg is below the requested address, then
 470                  * the insertion point is at the end of the linked list,
 471                  * and seg points to the tail of the list.  Otherwise,
 472                  * the insertion point is immediately before seg.
 473                  */
 474                 if (base + seg->s_size > addr) {
 475                         if (addr >= base || eaddr > base) {
 476 #ifdef __sparc
 477                                 extern const struct seg_ops segnf_ops;
 478 
 479                                 /*
 480                                  * no-fault segs must disappear if overlaid.
 481                                  * XXX need new segment type so
 482                                  * we don't have to check s_ops
 483                                  */
 484                                 if (seg->s_ops == &segnf_ops) {
 485                                         seg_unmap(seg);
 486                                         goto again;
 487                                 }
 488 #endif
 489                                 return (-1);    /* overlapping segment */
 490                         }
 491                 }
 492         }
 493         as->a_seglast = newseg;
 494         avl_insert(&as->a_segtree, newseg, where);
 495 
 496 #ifdef VERIFY_SEGLIST
 497         as_verify(as);
 498 #endif
 499         return (0);
 500 }
 501 
 502 struct seg *
 503 as_removeseg(struct as *as, struct seg *seg)
 504 {
 505         avl_tree_t *t;
 506 
 507         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
 508 
 509         as->a_updatedir = 1; /* inform /proc */
 510         gethrestime(&as->a_updatetime);
 511 
 512         if (seg == NULL)
 513                 return (NULL);
 514 
 515         t = &as->a_segtree;
 516         if (as->a_seglast == seg)
 517                 as->a_seglast = NULL;
 518         as->a_lastgaphl = NULL;
 519 
 520         /*
 521          * if this segment is at an address higher than
 522          * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
 523          */
 524         if (as->a_lastgap &&
 525             (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
 526                 as->a_lastgap = AVL_NEXT(t, seg);
 527 
 528         /*
 529          * remove the segment from the seg tree
 530          */
 531         avl_remove(t, seg);
 532 
 533 #ifdef VERIFY_SEGLIST
 534         as_verify(as);
 535 #endif
 536         return (seg);
 537 }
 538 
 539 /*
 540  * Find a segment containing addr.
 541  */
 542 struct seg *
 543 as_segat(struct as *as, caddr_t addr)
 544 {
 545         struct seg *seg = as->a_seglast;
 546 
 547         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
 548 
 549         if (seg != NULL && seg->s_base <= addr &&
 550             addr < seg->s_base + seg->s_size)
 551                 return (seg);
 552 
 553         seg = avl_find(&as->a_segtree, &addr, NULL);
 554         return (seg);
 555 }
 556 
 557 /*
 558  * Serialize all searches for holes in an address space to
 559  * prevent two or more threads from allocating the same virtual
 560  * address range.  The address space must not be "read/write"
 561  * locked by the caller since we may block.
 562  */
 563 void
 564 as_rangelock(struct as *as)
 565 {
 566         mutex_enter(&as->a_contents);
 567         while (AS_ISCLAIMGAP(as))
 568                 cv_wait(&as->a_cv, &as->a_contents);
 569         AS_SETCLAIMGAP(as);
 570         mutex_exit(&as->a_contents);
 571 }
 572 
 573 /*
 574  * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 575  */
 576 void
 577 as_rangeunlock(struct as *as)
 578 {
 579         mutex_enter(&as->a_contents);
 580         AS_CLRCLAIMGAP(as);
 581         cv_signal(&as->a_cv);
 582         mutex_exit(&as->a_contents);
 583 }
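
/*
 * Typical usage (sketch): a mmap-style consumer brackets its hole search
 * and the subsequent mapping so no other thread can claim the same range
 * in between.  map_addr() and as_map() live elsewhere; crargs and flags
 * stand in for the caller's segment-creation arguments.
 *
 *	as_rangelock(as);
 *	map_addr(&addr, len, (offset_t)off, 1, flags);
 *	if (addr == NULL) {
 *		as_rangeunlock(as);
 *		return (ENOMEM);
 *	}
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */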
 584 
 585 /*
 586  * compare segments (or just an address) by segment address range
 587  */
 588 static int
 589 as_segcompar(const void *x, const void *y)
 590 {
 591         struct seg *a = (struct seg *)x;
 592         struct seg *b = (struct seg *)y;
 593 
 594         if (a->s_base < b->s_base)
 595                 return (-1);
 596         if (a->s_base >= b->s_base + b->s_size)
 597                 return (1);
 598         return (0);
 599 }
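
/*
 * Note: avl_find() can be handed a bare address (the &addr lookups in
 * as_findseg() and as_segat() above) because s_base is the first member
 * of struct seg and this comparator reads only s_base from its
 * search-key argument; an address therefore compares "equal" to any
 * segment whose range contains it.
 */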
 600 
 601 
 602 void
 603 as_avlinit(struct as *as)
 604 {
 605         avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
 606             offsetof(struct seg, s_tree));
 607         avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
 608             offsetof(struct watched_page, wp_link));
 609 }
 610 
 611 /*ARGSUSED*/
 612 static int
 613 as_constructor(void *buf, void *cdrarg, int kmflags)
 614 {
 615         struct as *as = buf;
 616 
 617         mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
 618         cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
 619         rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
 620         as_avlinit(as);
 621         return (0);
 622 }
 623 
 624 /*ARGSUSED1*/
 625 static void
 626 as_destructor(void *buf, void *cdrarg)
 627 {
 628         struct as *as = buf;
 629 
 630         avl_destroy(&as->a_segtree);
 631         mutex_destroy(&as->a_contents);
 632         cv_destroy(&as->a_cv);
 633         rw_destroy(&as->a_lock);
 634 }
 635 
 636 void
 637 as_init(void)
 638 {
 639         as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
 640             as_constructor, as_destructor, NULL, NULL, NULL, 0);
 641 }
 642 
 643 /*
 644  * Allocate and initialize an address space data structure.
 645  * We call hat_alloc to allow any machine-dependent
 646  * information in the hat structure to be initialized.
 647  */
 648 struct as *
 649 as_alloc(void)
 650 {
 651         struct as *as;
 652 
 653         as = kmem_cache_alloc(as_cache, KM_SLEEP);
 654 
 655         as->a_flags     = 0;
 656         as->a_vbits     = 0;
 657         as->a_hrm       = NULL;
 658         as->a_seglast   = NULL;
 659         as->a_size      = 0;
 660         as->a_resvsize  = 0;
 661         as->a_updatedir = 0;
 662         gethrestime(&as->a_updatetime);
 663         as->a_objectdir = NULL;
 664         as->a_sizedir   = 0;
 665         as->a_userlimit = (caddr_t)USERLIMIT;
 666         as->a_lastgap   = NULL;
 667         as->a_lastgaphl = NULL;
 668         as->a_callbacks = NULL;
 669 
 670         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 671         as->a_hat = hat_alloc(as);   /* create hat for default system mmu */
 672         AS_LOCK_EXIT(as, &as->a_lock);
 673 
 674         as->a_xhat = NULL;
 675 
 676         return (as);
 677 }
 678 
 679 /*
 680  * Free an address space data structure.
 681  * We need to free the hat first, then
 682  * all the segments on this as, and finally
 683  * the space for the as struct itself.
 684  */
 685 void
 686 as_free(struct as *as)
 687 {
 688         struct hat *hat = as->a_hat;
 689         struct seg *seg, *next;
 690         int called = 0;
 691 
 692 top:
 693         /*
 694          * Invoke ALL callbacks. as_do_callbacks will do one callback
 695          * per call, and not return (-1) until the callback has completed.
 696          * When as_do_callbacks returns zero, all callbacks have completed.
 697          */
 698         mutex_enter(&as->a_contents);
 699         while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 700                 ;
 701 
 702         /* This will prevent new XHATs from attaching to as */
 703         if (!called)
 704                 AS_SETBUSY(as);
 705         mutex_exit(&as->a_contents);
 706         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 707 
 708         if (!called) {
 709                 called = 1;
 710                 hat_free_start(hat);
 711                 if (as->a_xhat != NULL)
 712                         xhat_free_start_all(as);
 713         }
 714         for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
 715                 int err;
 716 
 717                 next = AS_SEGNEXT(as, seg);
 718 retry:
 719                 err = segop_unmap(seg, seg->s_base, seg->s_size);
 720                 if (err == EAGAIN) {
 721                         mutex_enter(&as->a_contents);
 722                         if (as->a_callbacks) {
 723                                 AS_LOCK_EXIT(as, &as->a_lock);
 724                         } else if (!AS_ISNOUNMAPWAIT(as)) {
 725                                 /*
 726                                  * Memory is currently locked. Wait for a
 727                                  * cv_signal that it has been unlocked, then
 728                                  * try the operation again.
 729                                  */
 730                                 if (AS_ISUNMAPWAIT(as) == 0)
 731                                         cv_broadcast(&as->a_cv);
 732                                 AS_SETUNMAPWAIT(as);
 733                                 AS_LOCK_EXIT(as, &as->a_lock);
 734                                 while (AS_ISUNMAPWAIT(as))
 735                                         cv_wait(&as->a_cv, &as->a_contents);
 736                         } else {
 737                                 /*
 738                                  * We may have raced with
 739                                  * segvn_reclaim()/segspt_reclaim(). In this
 740                                  * case clean nounmapwait flag and retry since
 741                                  * softlockcnt in this segment may be already
 742                                  * 0.  We don't drop as writer lock so our
 743                                  * number of retries without sleeping should
 744                                  * be very small. See segvn_reclaim() for
 745                                  * more comments.
 746                                  */
 747                                 AS_CLRNOUNMAPWAIT(as);
 748                                 mutex_exit(&as->a_contents);
 749                                 goto retry;
 750                         }
 751                         mutex_exit(&as->a_contents);
 752                         goto top;
 753                 } else {
 754                         /*
 755                          * We do not expect any other error return at this
 756                          * time. This is similar to an ASSERT in seg_unmap()
 757                          */
 758                         ASSERT(err == 0);
 759                 }
 760         }
 761         hat_free_end(hat);
 762         if (as->a_xhat != NULL)
 763                 xhat_free_end_all(as);
 764         AS_LOCK_EXIT(as, &as->a_lock);
 765 
 766         /* /proc stuff */
 767         ASSERT(avl_numnodes(&as->a_wpage) == 0);
 768         if (as->a_objectdir) {
 769                 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
 770                 as->a_objectdir = NULL;
 771                 as->a_sizedir = 0;
 772         }
 773 
 774         /*
 775          * Free the struct as back to kmem.  Assert it has no segments.
 776          */
 777         ASSERT(avl_numnodes(&as->a_segtree) == 0);
 778         kmem_cache_free(as_cache, as);
 779 }
 780 
 781 int
 782 as_dup(struct as *as, struct proc *forkedproc)
 783 {
 784         struct as *newas;
 785         struct seg *seg, *newseg;
 786         size_t  purgesize = 0;
 787         int error;
 788 
 789         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 790         as_clearwatch(as);
 791         newas = as_alloc();
 792         newas->a_userlimit = as->a_userlimit;
 793         newas->a_proc = forkedproc;
 794 
 795         AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
 796 
 797         /* This will prevent new XHATs from attaching */
 798         mutex_enter(&as->a_contents);
 799         AS_SETBUSY(as);
 800         mutex_exit(&as->a_contents);
 801         mutex_enter(&newas->a_contents);
 802         AS_SETBUSY(newas);
 803         mutex_exit(&newas->a_contents);
 804 
 805         (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
 806 
 807         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 808 
 809                 if (seg->s_flags & S_PURGE) {
 810                         purgesize += seg->s_size;
 811                         continue;
 812                 }
 813 
 814                 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
 815                 if (newseg == NULL) {
 816                         AS_LOCK_EXIT(newas, &newas->a_lock);
 817                         as_setwatch(as);
 818                         mutex_enter(&as->a_contents);
 819                         AS_CLRBUSY(as);
 820                         mutex_exit(&as->a_contents);
 821                         AS_LOCK_EXIT(as, &as->a_lock);
 822                         as_free(newas);
 823                         return (-1);
 824                 }
 825                 if ((error = segop_dup(seg, newseg)) != 0) {
 826                         /*
 827                          * We call seg_free() on the new seg
 828                          * because the segment is not set up
 829                          * completely; i.e. it has no ops.
 830                          */
 831                         as_setwatch(as);
 832                         mutex_enter(&as->a_contents);
 833                         AS_CLRBUSY(as);
 834                         mutex_exit(&as->a_contents);
 835                         AS_LOCK_EXIT(as, &as->a_lock);
 836                         seg_free(newseg);
 837                         AS_LOCK_EXIT(newas, &newas->a_lock);
 838                         as_free(newas);
 839                         return (error);
 840                 }
 841                 newas->a_size += seg->s_size;
 842         }
 843         newas->a_resvsize = as->a_resvsize - purgesize;
 844 
 845         error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
 846         if (as->a_xhat != NULL)
 847                 error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
 848 
 849         mutex_enter(&newas->a_contents);
 850         AS_CLRBUSY(newas);
 851         mutex_exit(&newas->a_contents);
 852         AS_LOCK_EXIT(newas, &newas->a_lock);
 853 
 854         as_setwatch(as);
 855         mutex_enter(&as->a_contents);
 856         AS_CLRBUSY(as);
 857         mutex_exit(&as->a_contents);
 858         AS_LOCK_EXIT(as, &as->a_lock);
 859         if (error != 0) {
 860                 as_free(newas);
 861                 return (error);
 862         }
 863         forkedproc->p_as = newas;
 864         return (0);
 865 }
 866 
 867 /*
 868  * Handle a ``fault'' at addr for size bytes.
 869  */
 870 faultcode_t
 871 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
 872         enum fault_type type, enum seg_rw rw)
 873 {
 874         struct seg *seg;
 875         caddr_t raddr;                  /* rounded down addr */
 876         size_t rsize;                   /* rounded up size */
 877         size_t ssize;
 878         faultcode_t res = 0;
 879         caddr_t addrsav;
 880         struct seg *segsav;
 881         int as_lock_held;
 882         klwp_t *lwp = ttolwp(curthread);
 883         int is_xhat = 0;
 884         int holding_wpage = 0;
 885 
 886         if (as->a_hat != hat) {
 887                 /* This must be an XHAT then */
 888                 is_xhat = 1;
 889 
 890                 if ((type != F_INVAL) || (as == &kas))
 891                         return (FC_NOSUPPORT);
 892         }
 893 
 894 retry:
 895         if (!is_xhat) {
 896                 /*
 897                  * Indicate that the lwp is not to be stopped while waiting
 898                  * for a pagefault.  This is to avoid deadlock while debugging
 899                  * a process via /proc over NFS (in particular).
 900                  */
 901                 if (lwp != NULL)
 902                         lwp->lwp_nostop++;
 903 
 904                 /*
 905                  * The same length must be used when we softlock and softunlock.
 906                  * We don't support softunlocking lengths less than
 907                  * the original length when there is largepage support.
 908                  * See seg_dev.c for more comments.
 909                  */
 910                 switch (type) {
 911 
 912                 case F_SOFTLOCK:
 913                         CPU_STATS_ADD_K(vm, softlock, 1);
 914                         break;
 915 
 916                 case F_SOFTUNLOCK:
 917                         break;
 918 
 919                 case F_PROT:
 920                         CPU_STATS_ADD_K(vm, prot_fault, 1);
 921                         break;
 922 
 923                 case F_INVAL:
 924                         CPU_STATS_ENTER_K();
 925                         CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
 926                         if (as == &kas)
 927                                 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
 928                         CPU_STATS_EXIT_K();
 929                         break;
 930                 }
 931         }
 932 
 933         /* Kernel probe */
 934         TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
 935             tnf_opaque, address,        addr,
 936             tnf_fault_type,     fault_type,     type,
 937             tnf_seg_access,     access,         rw);
 938 
 939         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
 940         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
 941             (size_t)raddr;
 942 
 943         /*
 944          * XXX -- Don't grab the as lock for segkmap. We should grab it for
 945          * correctness, but then we could be stuck holding this lock for
 946          * a LONG time if the fault needs to be resolved on a slow
 947          * filesystem, and then no-one will be able to exec new commands,
 948          * as exec'ing requires the write lock on the as.
 949          */
 950         if (as == &kas && segkmap && segkmap->s_base <= raddr &&
 951             raddr + size < segkmap->s_base + segkmap->s_size) {
 952                 /*
 953                  * if (as==&kas), this can't be XHAT: we've already returned
 954                  * FC_NOSUPPORT.
 955                  */
 956                 seg = segkmap;
 957                 as_lock_held = 0;
 958         } else {
 959                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
 960                 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
 961                         /*
 962                          * Grab and hold the writers' lock on the as
 963                          * if the fault is to a watched page.
 964                          * This will keep CPUs from "peeking" at the
 965                          * address range while we're temporarily boosting
 966                          * the permissions for the XHAT device to
 967                          * resolve the fault in the segment layer.
 968                          *
 969                          * We could check whether faulted address
 970                          * is within a watched page and only then grab
 971                          * the writer lock, but this is simpler.
 972                          */
 973                         AS_LOCK_EXIT(as, &as->a_lock);
 974                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 975                 }
 976 
 977                 seg = as_segat(as, raddr);
 978                 if (seg == NULL) {
 979                         AS_LOCK_EXIT(as, &as->a_lock);
 980                         if ((lwp != NULL) && (!is_xhat))
 981                                 lwp->lwp_nostop--;
 982                         return (FC_NOMAP);
 983                 }
 984 
 985                 as_lock_held = 1;
 986         }
 987 
 988         addrsav = raddr;
 989         segsav = seg;
 990 
 991         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
 992                 if (raddr >= seg->s_base + seg->s_size) {
 993                         seg = AS_SEGNEXT(as, seg);
 994                         if (seg == NULL || raddr != seg->s_base) {
 995                                 res = FC_NOMAP;
 996                                 break;
 997                         }
 998                 }
 999                 if (raddr + rsize > seg->s_base + seg->s_size)
1000                         ssize = seg->s_base + seg->s_size - raddr;
1001                 else
1002                         ssize = rsize;
1003 
1004                 if (!is_xhat || (seg->s_ops != &segdev_ops)) {
1005 
1006                         if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
1007                             pr_is_watchpage_as(raddr, rw, as)) {
1008                                 /*
1009                                  * Handle watch pages.  If we're faulting on a
1010                                  * watched page from an X-hat, we have to
1011                                  * restore the original permissions while we
1012                                  * handle the fault.
1013                                  */
1014                                 as_clearwatch(as);
1015                                 holding_wpage = 1;
1016                         }
1017 
1018                         res = segop_fault(hat, seg, raddr, ssize, type, rw);
1019 
1020                         /* Restore watchpoints */
1021                         if (holding_wpage) {
1022                                 as_setwatch(as);
1023                                 holding_wpage = 0;
1024                         }
1025 
1026                         if (res != 0)
1027                                 break;
1028                 } else {
1029                         /* XHAT does not support seg_dev */
1030                         res = FC_NOSUPPORT;
1031                         break;
1032                 }
1033         }
1034 
1035         /*
1036          * If we were SOFTLOCKing and encountered a failure,
1037          * we must SOFTUNLOCK the range we already did. (Maybe we
1038          * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
1039          * right here...)
1040          */
1041         if (res != 0 && type == F_SOFTLOCK) {
1042                 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
1043                         if (addrsav >= seg->s_base + seg->s_size)
1044                                 seg = AS_SEGNEXT(as, seg);
1045                         ASSERT(seg != NULL);
1046                         /*
1047                          * Now call the fault routine again to perform the
1048                          * unlock using S_OTHER instead of the rw variable
1049                          * since we never got a chance to touch the pages.
1050                          */
1051                         if (raddr > seg->s_base + seg->s_size)
1052                                 ssize = seg->s_base + seg->s_size - addrsav;
1053                         else
1054                                 ssize = raddr - addrsav;
1055                         (void) segop_fault(hat, seg, addrsav, ssize,
1056                             F_SOFTUNLOCK, S_OTHER);
1057                 }
1058         }
1059         if (as_lock_held)
1060                 AS_LOCK_EXIT(as, &as->a_lock);
1061         if ((lwp != NULL) && (!is_xhat))
1062                 lwp->lwp_nostop--;
1063 
1064         /*
1065          * If the lower levels returned EDEADLK for a fault,
1066          * it means that we should retry the fault.  Let's also wait
1067          * a bit to let the deadlock-causing condition clear.
1068          * This is part of a gross hack to work around a design flaw
1069          * in the ufs/sds logging code and should go away when the
1070          * logging code is re-designed to fix the problem. See bug
1071          * 4125102 for details of the problem.
1072          */
1073         if (FC_ERRNO(res) == EDEADLK) {
1074                 delay(deadlk_wait);
1075                 res = 0;
1076                 goto retry;
1077         }
1078         return (res);
1079 }
1080 
1081 
1082 
1083 /*
1084  * Asynchronous ``fault'' at addr for size bytes.
1085  */
1086 faultcode_t
1087 as_faulta(struct as *as, caddr_t addr, size_t size)
1088 {
1089         struct seg *seg;
1090         caddr_t raddr;                  /* rounded down addr */
1091         size_t rsize;                   /* rounded up size */
1092         faultcode_t res = 0;
1093         klwp_t *lwp = ttolwp(curthread);
1094 
1095 retry:
1096         /*
1097          * Indicate that the lwp is not to be stopped while waiting
1098          * for a pagefault.  This is to avoid deadlock while debugging
1099          * a process via /proc over NFS (in particular).
1100          */
1101         if (lwp != NULL)
1102                 lwp->lwp_nostop++;
1103 
1104         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1105         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1106             (size_t)raddr;
1107 
1108         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1109         seg = as_segat(as, raddr);
1110         if (seg == NULL) {
1111                 AS_LOCK_EXIT(as, &as->a_lock);
1112                 if (lwp != NULL)
1113                         lwp->lwp_nostop--;
1114                 return (FC_NOMAP);
1115         }
1116 
1117         for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1118                 if (raddr >= seg->s_base + seg->s_size) {
1119                         seg = AS_SEGNEXT(as, seg);
1120                         if (seg == NULL || raddr != seg->s_base) {
1121                                 res = FC_NOMAP;
1122                                 break;
1123                         }
1124                 }
1125                 res = segop_faulta(seg, raddr);
1126                 if (res != 0)
1127                         break;
1128         }
1129         AS_LOCK_EXIT(as, &as->a_lock);
1130         if (lwp != NULL)
1131                 lwp->lwp_nostop--;
1132         /*
1133          * If the lower levels returned EDEADLK for a fault,
1134          * it means that we should retry the fault.  Let's also wait
1135          * a bit to let the deadlock-causing condition clear.
1136          * This is part of a gross hack to work around a design flaw
1137          * in the ufs/sds logging code and should go away when the
1138          * logging code is re-designed to fix the problem. See bug
1139          * 4125102 for details of the problem.
1140          */
1141         if (FC_ERRNO(res) == EDEADLK) {
1142                 delay(deadlk_wait);
1143                 res = 0;
1144                 goto retry;
1145         }
1146         return (res);
1147 }
1148 
1149 /*
1150  * Set the virtual mapping for the interval from [addr : addr + size)
1151  * in address space `as' to have the specified protection.
1152  * It is ok for the range to cross over several segments,
1153  * as long as they are contiguous.
1154  */
1155 int
1156 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1157 {
1158         struct seg *seg;
1159         struct as_callback *cb;
1160         size_t ssize;
1161         caddr_t raddr;                  /* rounded down addr */
1162         size_t rsize;                   /* rounded up size */
1163         int error = 0, writer = 0;
1164         caddr_t saveraddr;
1165         size_t saversize;
1166 
1167 setprot_top:
1168         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1169         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1170             (size_t)raddr;
1171 
1172         if (raddr + rsize < raddr)           /* check for wraparound */
1173                 return (ENOMEM);
1174 
1175         saveraddr = raddr;
1176         saversize = rsize;
1177 
1178         /*
1179          * Normally we only lock the as as a reader. But
1180          * if, due to setprot, the segment driver needs to split
1181          * a segment, it will return IE_RETRY. Therefore we re-acquire
1182          * the as lock as a writer so the segment driver can change
1183          * the seg list. The segment driver will also return IE_RETRY
1184          * after it has changed the segment list, so we keep
1185          * locking as a writer. Since these operations should be rare,
1186          * we want to lock as a writer only when necessary.
1187          */
1188         if (writer || avl_numnodes(&as->a_wpage) != 0) {
1189                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1190         } else {
1191                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1192         }
1193 
1194         as_clearwatchprot(as, raddr, rsize);
1195         seg = as_segat(as, raddr);
1196         if (seg == NULL) {
1197                 as_setwatch(as);
1198                 AS_LOCK_EXIT(as, &as->a_lock);
1199                 return (ENOMEM);
1200         }
1201 
1202         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1203                 if (raddr >= seg->s_base + seg->s_size) {
1204                         seg = AS_SEGNEXT(as, seg);
1205                         if (seg == NULL || raddr != seg->s_base) {
1206                                 error = ENOMEM;
1207                                 break;
1208                         }
1209                 }
1210                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1211                         ssize = seg->s_base + seg->s_size - raddr;
1212                 else
1213                         ssize = rsize;
1214 retry:
1215                 error = segop_setprot(seg, raddr, ssize, prot);
1216 
1217                 if (error == IE_NOMEM) {
1218                         error = EAGAIN;
1219                         break;
1220                 }
1221 
1222                 if (error == IE_RETRY) {
1223                         AS_LOCK_EXIT(as, &as->a_lock);
1224                         writer = 1;
1225                         goto setprot_top;
1226                 }
1227 
1228                 if (error == EAGAIN) {
1229                         /*
1230                          * Make sure we have a_lock as writer.
1231                          */
1232                         if (writer == 0) {
1233                                 AS_LOCK_EXIT(as, &as->a_lock);
1234                                 writer = 1;
1235                                 goto setprot_top;
1236                         }
1237 
1238                         /*
1239                          * Memory is currently locked.  It must be unlocked
1240                          * before this operation can succeed through a retry.
1241                          * The possible reasons for locked memory and
1242                          * corresponding strategies for unlocking are:
1243                          * (1) Normal I/O
1244                          *      wait for a signal that the I/O operation
1245                          *      has completed and the memory is unlocked.
1246                          * (2) Asynchronous I/O
1247                          *      The aio subsystem does not unlock pages when
1248                          *      the I/O is completed. Those pages are unlocked
1249                          *      when the application calls aiowait/aioerror.
1250                          *      So, to prevent blocking forever, cv_broadcast()
1251                          *      is done to wake up aio_cleanup_thread.
1252                          *      Subsequently, segvn_reclaim will be called, and
1253                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1254                          * (3) Long term page locking:
1255                          *      Drivers intending to have pages locked for a
1256                          *      period considerably longer than for normal I/O
1257                          *      (essentially forever) may have registered for a
1258                          *      callback so they may unlock these pages on
1259                          *      request. This is needed to allow this operation
1260                          *      to succeed. Each entry on the callback list is
1261          *      examined. If the event or address range pertains,
1262          *      the callback is invoked (unless it is already in
1263                          *      progress). The a_contents lock must be dropped
1264                          *      before the callback, so only one callback can
1265                          *      be done at a time. Go to the top and do more
1266                          *      until zero is returned. If zero is returned,
1267                          *      either there were no callbacks for this event
1268                          *      or they were already in progress.
1269                          */
1270                         mutex_enter(&as->a_contents);
1271                         if (as->a_callbacks &&
1272                             (cb = as_find_callback(as, AS_SETPROT_EVENT,
1273                             seg->s_base, seg->s_size))) {
1274                                 AS_LOCK_EXIT(as, &as->a_lock);
1275                                 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1276                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1277                                 if (AS_ISUNMAPWAIT(as) == 0)
1278                                         cv_broadcast(&as->a_cv);
1279                                 AS_SETUNMAPWAIT(as);
1280                                 AS_LOCK_EXIT(as, &as->a_lock);
1281                                 while (AS_ISUNMAPWAIT(as))
1282                                         cv_wait(&as->a_cv, &as->a_contents);
1283                         } else {
1284                                 /*
1285                                  * We may have raced with
1286                                  * segvn_reclaim()/segspt_reclaim(). In this
1287                                  * case clean nounmapwait flag and retry since
1288                                  * softlockcnt in this segment may be already
1289                                  * 0.  We don't drop as writer lock so our
1290                                  * number of retries without sleeping should
1291                                  * be very small. See segvn_reclaim() for
1292                                  * more comments.
1293                                  */
1294                                 AS_CLRNOUNMAPWAIT(as);
1295                                 mutex_exit(&as->a_contents);
1296                                 goto retry;
1297                         }
1298                         mutex_exit(&as->a_contents);
1299                         goto setprot_top;
1300                 } else if (error != 0)
1301                         break;
1302         }
1303         if (error != 0) {
1304                 as_setwatch(as);
1305         } else {
1306                 as_setwatchprot(as, saveraddr, saversize, prot);
1307         }
1308         AS_LOCK_EXIT(as, &as->a_lock);
1309         return (error);
1310 }
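
/*
 * For example (sketch): the mprotect(2)/memcntl(2) paths ultimately
 * reduce to a call of roughly this shape, with the protections already
 * translated into PROT_* bits understood by the segment drivers:
 *
 *	error = as_setprot(curproc->p_as, (caddr_t)addr, len,
 *	    prot | PROT_USER);
 */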
1311 
1312 /*
1313  * Check to make sure that the interval [addr, addr + size)
1314  * in address space `as' has at least the specified protection.
1315  * It is ok for the range to cross over several segments, as long
1316  * as they are contiguous.
1317  */
1318 int
1319 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1320 {
1321         struct seg *seg;
1322         size_t ssize;
1323         caddr_t raddr;                  /* rounded down addr */
1324         size_t rsize;                   /* rounded up size */
1325         int error = 0;
1326 
1327         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1328         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1329             (size_t)raddr;
1330 
1331         if (raddr + rsize < raddr)           /* check for wraparound */
1332                 return (ENOMEM);
1333 
1334         /*
1335          * This is ugly as sin...
1336          * Normally, we only acquire the address space readers lock.
1337          * However, if the address space has watchpoints present,
1338          * we must acquire the writer lock on the address space for
1339          * the benefit of as_clearwatchprot() and as_setwatchprot().
1340          */
1341         if (avl_numnodes(&as->a_wpage) != 0)
1342                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1343         else
1344                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1345         as_clearwatchprot(as, raddr, rsize);
1346         seg = as_segat(as, raddr);
1347         if (seg == NULL) {
1348                 as_setwatch(as);
1349                 AS_LOCK_EXIT(as, &as->a_lock);
1350                 return (ENOMEM);
1351         }
1352 
1353         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1354                 if (raddr >= seg->s_base + seg->s_size) {
1355                         seg = AS_SEGNEXT(as, seg);
1356                         if (seg == NULL || raddr != seg->s_base) {
1357                                 error = ENOMEM;
1358                                 break;
1359                         }
1360                 }
1361                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1362                         ssize = seg->s_base + seg->s_size - raddr;
1363                 else
1364                         ssize = rsize;
1365 
1366                 error = segop_checkprot(seg, raddr, ssize, prot);
1367                 if (error != 0)
1368                         break;
1369         }
1370         as_setwatch(as);
1371         AS_LOCK_EXIT(as, &as->a_lock);
1372         return (error);
1373 }
1374 
1375 int
1376 as_unmap(struct as *as, caddr_t addr, size_t size)
1377 {
1378         struct seg *seg, *seg_next;
1379         struct as_callback *cb;
1380         caddr_t raddr, eaddr;
1381         size_t ssize, rsize = 0;
1382         int err;
1383 
1384 top:
1385         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1386         eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1387             (uintptr_t)PAGEMASK);
1388 
1389         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1390 
1391         as->a_updatedir = 1; /* inform /proc */
1392         gethrestime(&as->a_updatetime);
1393 
1394         /*
1395          * Use as_findseg to find the first segment in the range, then
1396          * step through the segments in order, following s_next.
1397          */
1398         as_clearwatchprot(as, raddr, eaddr - raddr);
1399 
1400         for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1401                 if (eaddr <= seg->s_base)
1402                         break;          /* eaddr was in a gap; all done */
1403 
1404                 /* this is implied by the test above */
1405                 ASSERT(raddr < eaddr);
1406 
1407                 if (raddr < seg->s_base)
1408                         raddr = seg->s_base;         /* raddr was in a gap */
1409 
1410                 if (eaddr > (seg->s_base + seg->s_size))
1411                         ssize = seg->s_base + seg->s_size - raddr;
1412                 else
1413                         ssize = eaddr - raddr;
1414 
1415                 /*
1416                  * Save next segment pointer since seg can be
1417                  * destroyed during the segment unmap operation.
1418                  */
1419                 seg_next = AS_SEGNEXT(as, seg);
1420 
1421                 /*
1422                  * We didn't count /dev/null mappings, so ignore them here.
1423                  * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1424                  * we have to do this check here while we have seg.)
1425                  */
1426                 rsize = 0;
1427                 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1428                     !SEG_IS_PARTIAL_RESV(seg))
1429                         rsize = ssize;
1430 
1431 retry:
1432                 err = segop_unmap(seg, raddr, ssize);
1433                 if (err == EAGAIN) {
1434                         /*
1435                          * Memory is currently locked.  It must be unlocked
1436                          * before this operation can succeed through a retry.
1437                          * The possible reasons for locked memory and
1438                          * corresponding strategies for unlocking are:
1439                          * (1) Normal I/O
1440                          *      wait for a signal that the I/O operation
1441                          *      has completed and the memory is unlocked.
1442                          * (2) Asynchronous I/O
1443                          *      The aio subsystem does not unlock pages when
1444                          *      the I/O is completed. Those pages are unlocked
1445                          *      when the application calls aiowait/aioerror.
1446                          *      So, to prevent blocking forever, cv_broadcast()
1447                          *      is done to wake up aio_cleanup_thread.
1448                          *      Subsequently, segvn_reclaim will be called, and
1449                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1450                          * (3) Long term page locking:
1451                          *      Drivers intending to have pages locked for a
1452                          *      period considerably longer than for normal I/O
1453                          *      (essentially forever) may have registered for a
1454                          *      callback so they may unlock these pages on
1455                          *      request. This is needed to allow this operation
1456                          *      to succeed. Each entry on the callback list is
1457                          *      examined. If the event or address range pertains,
1458                          *      the callback is invoked (unless it is already in
1459                          *      progress). The a_contents lock must be dropped
1460                          *      before the callback, so only one callback can
1461                          *      be done at a time. Go to the top and do more
1462                          *      until zero is returned. If zero is returned,
1463                          *      either there were no callbacks for this event
1464                          *      or they were already in progress.
1465                          */
1466                         mutex_enter(&as->a_contents);
1467                         if (as->a_callbacks &&
1468                             (cb = as_find_callback(as, AS_UNMAP_EVENT,
1469                             seg->s_base, seg->s_size))) {
1470                                 AS_LOCK_EXIT(as, &as->a_lock);
1471                                 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1472                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1473                                 if (AS_ISUNMAPWAIT(as) == 0)
1474                                         cv_broadcast(&as->a_cv);
1475                                 AS_SETUNMAPWAIT(as);
1476                                 AS_LOCK_EXIT(as, &as->a_lock);
1477                                 while (AS_ISUNMAPWAIT(as))
1478                                         cv_wait(&as->a_cv, &as->a_contents);
1479                         } else {
1480                                 /*
1481                                  * We may have raced with
1482                                  * segvn_reclaim()/segspt_reclaim(). In this
1483                                  * case clear the nounmapwait flag and retry, since
1484                                  * softlockcnt in this segment may already be
1485                                  * 0.  We don't drop the as writer lock, so our
1486                                  * number of retries without sleeping should
1487                                  * be very small. See segvn_reclaim() for
1488                                  * more comments.
1489                                  */
1490                                 AS_CLRNOUNMAPWAIT(as);
1491                                 mutex_exit(&as->a_contents);
1492                                 goto retry;
1493                         }
1494                         mutex_exit(&as->a_contents);
1495                         goto top;
1496                 } else if (err == IE_RETRY) {
1497                         AS_LOCK_EXIT(as, &as->a_lock);
1498                         goto top;
1499                 } else if (err) {
1500                         as_setwatch(as);
1501                         AS_LOCK_EXIT(as, &as->a_lock);
1502                         return (-1);
1503                 }
1504 
1505                 as->a_size -= ssize;
1506                 if (rsize)
1507                         as->a_resvsize -= rsize;
1508                 raddr += ssize;
1509         }
1510         AS_LOCK_EXIT(as, &as->a_lock);
1511         return (0);
1512 }
1513 
1514 static int
1515 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1516     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1517 {
1518         uint_t szc;
1519         uint_t nszc;
1520         int error;
1521         caddr_t a;
1522         caddr_t eaddr;
1523         size_t segsize;
1524         struct seg *seg;
1525         size_t pgsz;
1526         int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1527         uint_t save_szcvec;
1528 
1529         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1530         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1531         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1532         ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1533         if (!do_off) {
1534                 vn_a->offset = 0;
1535         }
1536 
1537         if (szcvec <= 1) {
1538                 seg = seg_alloc(as, addr, size);
1539                 if (seg == NULL) {
1540                         return (ENOMEM);
1541                 }
1542                 vn_a->szc = 0;
1543                 error = (*crfp)(seg, vn_a);
1544                 if (error != 0) {
1545                         seg_free(seg);
1546                 } else {
1547                         as->a_size += size;
1548                         as->a_resvsize += size;
1549                 }
1550                 return (error);
1551         }
1552 
1553         eaddr = addr + size;
1554         save_szcvec = szcvec;
1555         szcvec >>= 1;
1556         szc = 0;
1557         nszc = 0;
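             /*
              * szcvec is a bit vector of usable page size codes: bit N set
              * means pages of size page_get_pagesize(N) may be used for this
              * mapping.  The first pass below walks it from the smallest to
              * the largest usable size, creating a leading segment of each
              * smaller size as needed so that the start address becomes
              * aligned for the next larger usable size; the aligned bulk and
              * the tail are handled by the second pass further down.
              */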
1558         while (szcvec) {
1559                 if ((szcvec & 0x1) == 0) {
1560                         nszc++;
1561                         szcvec >>= 1;
1562                         continue;
1563                 }
1564                 nszc++;
1565                 pgsz = page_get_pagesize(nszc);
1566                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1567                 if (a != addr) {
1568                         ASSERT(a < eaddr);
1569                         segsize = a - addr;
1570                         seg = seg_alloc(as, addr, segsize);
1571                         if (seg == NULL) {
1572                                 return (ENOMEM);
1573                         }
1574                         vn_a->szc = szc;
1575                         error = (*crfp)(seg, vn_a);
1576                         if (error != 0) {
1577                                 seg_free(seg);
1578                                 return (error);
1579                         }
1580                         as->a_size += segsize;
1581                         as->a_resvsize += segsize;
1582                         *segcreated = 1;
1583                         if (do_off) {
1584                                 vn_a->offset += segsize;
1585                         }
1586                         addr = a;
1587                 }
1588                 szc = nszc;
1589                 szcvec >>= 1;
1590         }
1591 
1592         ASSERT(addr < eaddr);
1593         szcvec = save_szcvec | 1; /* add 8K pages */
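             /*
              * Second pass: work back down from the largest usable page size.
              * Each iteration creates one segment from the current addr up to
              * the last pgsz-aligned boundary at or below eaddr, then drops to
              * the next smaller size.  Base (szc 0) pages were added back into
              * szcvec above, so the final iteration always ends exactly at
              * eaddr.
              */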
1594         while (szcvec) {
1595                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1596                 ASSERT(a >= addr);
1597                 if (a != addr) {
1598                         segsize = a - addr;
1599                         seg = seg_alloc(as, addr, segsize);
1600                         if (seg == NULL) {
1601                                 return (ENOMEM);
1602                         }
1603                         vn_a->szc = szc;
1604                         error = (*crfp)(seg, vn_a);
1605                         if (error != 0) {
1606                                 seg_free(seg);
1607                                 return (error);
1608                         }
1609                         as->a_size += segsize;
1610                         as->a_resvsize += segsize;
1611                         *segcreated = 1;
1612                         if (do_off) {
1613                                 vn_a->offset += segsize;
1614                         }
1615                         addr = a;
1616                 }
1617                 szcvec &= ~(1 << szc);
1618                 if (szcvec) {
1619                         szc = highbit(szcvec) - 1;
1620                         pgsz = page_get_pagesize(szc);
1621                 }
1622         }
1623         ASSERT(addr == eaddr);
1624 
1625         return (0);
1626 }
1627 
1628 static int
1629 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1630     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1631 {
1632         uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1633         int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1634         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1635             type, 0);
1636         int error;
1637         struct seg *seg;
1638         struct vattr va;
1639         u_offset_t eoff;
1640         size_t save_size = 0;
1641         extern size_t textrepl_size_thresh;
1642 
1643         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1644         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1645         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1646         ASSERT(vn_a->vp != NULL);
1647         ASSERT(vn_a->amp == NULL);
1648 
1649 again:
1650         if (szcvec <= 1) {
1651                 seg = seg_alloc(as, addr, size);
1652                 if (seg == NULL) {
1653                         return (ENOMEM);
1654                 }
1655                 vn_a->szc = 0;
1656                 error = (*crfp)(seg, vn_a);
1657                 if (error != 0) {
1658                         seg_free(seg);
1659                 } else {
1660                         as->a_size += size;
1661                         as->a_resvsize += size;
1662                 }
1663                 return (error);
1664         }
1665 
1666         va.va_mask = AT_SIZE;
1667         if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1668                 szcvec = 0;
1669                 goto again;
1670         }
1671         eoff = vn_a->offset & PAGEMASK;
1672         if (eoff >= va.va_size) {
1673                 szcvec = 0;
1674                 goto again;
1675         }
1676         eoff += size;
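             /*
              * If the mapping extends past end-of-file, only use large pages
              * for the portion backed by the file; the remainder
              * (save_size - size) is mapped with the base page size by the
              * save_size check at the end of this function.
              */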
1677         if (btopr(va.va_size) < btopr(eoff)) {
1678                 save_size = size;
1679                 size = va.va_size - (vn_a->offset & PAGEMASK);
1680                 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1681                 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1682                     type, 0);
1683                 if (szcvec <= 1) {
1684                         size = save_size;
1685                         goto again;
1686                 }
1687         }
1688 
1689         if (size > textrepl_size_thresh) {
1690                 vn_a->flags |= _MAP_TEXTREPL;
1691         }
1692         error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1693             segcreated);
1694         if (error != 0) {
1695                 return (error);
1696         }
1697         if (save_size) {
1698                 addr += size;
1699                 size = save_size - size;
1700                 szcvec = 0;
1701                 goto again;
1702         }
1703         return (0);
1704 }
1705 
1706 /*
1707  * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1708  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1709  */
1710 static int
1711 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1712     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1713 {
1714         uint_t szcvec;
1715         uchar_t type;
1716 
1717         ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1718         if (vn_a->type == MAP_SHARED) {
1719                 type = MAPPGSZC_SHM;
1720         } else if (vn_a->type == MAP_PRIVATE) {
1721                 if (vn_a->szc == AS_MAP_HEAP) {
1722                         type = MAPPGSZC_HEAP;
1723                 } else if (vn_a->szc == AS_MAP_STACK) {
1724                         type = MAPPGSZC_STACK;
1725                 } else {
1726                         type = MAPPGSZC_PRIVM;
1727                 }
1728         }
1729         szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1730             (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1731             (vn_a->flags & MAP_TEXT), type, 0);
1732         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1733         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1734         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1735         ASSERT(vn_a->vp == NULL);
1736 
1737         return (as_map_segvn_segs(as, addr, size, szcvec,
1738             crfp, vn_a, segcreated));
1739 }
1740 
1741 int
1742 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1743 {
1744         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1745         return (as_map_locked(as, addr, size, crfp, argsp));
1746 }
1747 
1748 int
1749 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1750                 void *argsp)
1751 {
1752         struct seg *seg = NULL;
1753         caddr_t raddr;                  /* rounded down addr */
1754         size_t rsize;                   /* rounded up size */
1755         int error;
1756         int unmap = 0;
1757         struct proc *p = curproc;
1758         struct segvn_crargs crargs;
1759 
1760         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1761         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1762             (size_t)raddr;
1763 
1764         /*
1765          * check for wrap around
1766          */
1767         if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1768                 AS_LOCK_EXIT(as, &as->a_lock);
1769                 return (ENOMEM);
1770         }
1771 
1772         as->a_updatedir = 1; /* inform /proc */
1773         gethrestime(&as->a_updatetime);
1774 
1775         if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1776                 AS_LOCK_EXIT(as, &as->a_lock);
1777 
1778                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1779                     RCA_UNSAFE_ALL);
1780 
1781                 return (ENOMEM);
1782         }
1783 
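             /*
              * If the segment create function and its arguments indicate a
              * segvn mapping that may be able to use large pages (vnode- or
              * anon-backed), go through as_map_vnsegs()/as_map_ansegs() so
              * the range can be broken into multiple segments with different
              * page size codes; otherwise create a single segment directly.
              */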
1784         if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1785                 crargs = *(struct segvn_crargs *)argsp;
1786                 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1787                 if (error != 0) {
1788                         AS_LOCK_EXIT(as, &as->a_lock);
1789                         if (unmap) {
1790                                 (void) as_unmap(as, addr, size);
1791                         }
1792                         return (error);
1793                 }
1794         } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1795                 crargs = *(struct segvn_crargs *)argsp;
1796                 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1797                 if (error != 0) {
1798                         AS_LOCK_EXIT(as, &as->a_lock);
1799                         if (unmap) {
1800                                 (void) as_unmap(as, addr, size);
1801                         }
1802                         return (error);
1803                 }
1804         } else {
1805                 seg = seg_alloc(as, addr, size);
1806                 if (seg == NULL) {
1807                         AS_LOCK_EXIT(as, &as->a_lock);
1808                         return (ENOMEM);
1809                 }
1810 
1811                 error = (*crfp)(seg, argsp);
1812                 if (error != 0) {
1813                         seg_free(seg);
1814                         AS_LOCK_EXIT(as, &as->a_lock);
1815                         return (error);
1816                 }
1817                 /*
1818                  * Add size now so as_unmap will work if as_ctl fails.
1819                  */
1820                 as->a_size += rsize;
1821                 as->a_resvsize += rsize;
1822         }
1823 
1824         as_setwatch(as);
1825 
1826         /*
1827          * If the address space is locked,
1828          * establish memory locks for the new segment.
1829          */
1830         mutex_enter(&as->a_contents);
1831         if (AS_ISPGLCK(as)) {
1832                 mutex_exit(&as->a_contents);
1833                 AS_LOCK_EXIT(as, &as->a_lock);
1834                 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1835                 if (error != 0)
1836                         (void) as_unmap(as, addr, size);
1837         } else {
1838                 mutex_exit(&as->a_contents);
1839                 AS_LOCK_EXIT(as, &as->a_lock);
1840         }
1841         return (error);
1842 }
1843 
1844 
1845 /*
1846  * Delete all segments in the address space marked with S_PURGE.
1847  * This is currently used for SPARC V9 nofault ASI segments (seg_nf.c).
1848  * These segments are deleted as a first step before calls to as_gap(), so
1849  * that they don't affect mmap() or shmat().
1850  */
1851 void
1852 as_purge(struct as *as)
1853 {
1854         struct seg *seg;
1855         struct seg *next_seg;
1856 
1857         /*
1858          * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1859          * there is no need to grab the a_contents mutex for this check
1860          */
1861         if ((as->a_flags & AS_NEEDSPURGE) == 0)
1862                 return;
1863 
1864         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1865         next_seg = NULL;
1866         seg = AS_SEGFIRST(as);
1867         while (seg != NULL) {
1868                 next_seg = AS_SEGNEXT(as, seg);
1869                 if (seg->s_flags & S_PURGE)
1870                         (void) segop_unmap(seg, seg->s_base, seg->s_size);
1871                 seg = next_seg;
1872         }
1873         AS_LOCK_EXIT(as, &as->a_lock);
1874 
1875         mutex_enter(&as->a_contents);
1876         as->a_flags &= ~AS_NEEDSPURGE;
1877         mutex_exit(&as->a_contents);
1878 }
1879 
1880 /*
1881  * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1882  * range of addresses at least "minlen" long, where the base of the range is
1883  * at "off" phase from an "align" boundary and there is space for a
1884  * "redzone"-sized redzone on either side of the range.  Thus,
1885  * if align was 4M and off was 16k, the user wants a hole which will start
1886  * 16k into a 4M page.
1887  *
1888  * If flags specifies AH_HI, the hole will have the highest possible address
1889  * in the range.  We use the as->a_lastgap field to figure out where to
1890  * start looking for a gap.
1891  *
1892  * Otherwise, the gap will have the lowest possible address.
1893  *
1894  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1895  *
1896  * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1897  * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1898  *
1899  * NOTE: This routine is not correct when base+len overflows caddr_t.
1900  */
1901 int
1902 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1903     uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1904 {
1905         caddr_t lobound = *basep;
1906         caddr_t hibound = lobound + *lenp;
1907         struct seg *lseg, *hseg;
1908         caddr_t lo, hi;
1909         int forward;
1910         caddr_t save_base;
1911         size_t save_len;
1912         size_t save_minlen;
1913         size_t save_redzone;
1914         int fast_path = 1;
1915 
1916         save_base = *basep;
1917         save_len = *lenp;
1918         save_minlen = minlen;
1919         save_redzone = redzone;
1920 
1921         /*
1922          * For the first pass/fast_path, just add align and redzone into
1923          * minlen since if we get an allocation, we can guarantee that it
1924          * will fit the alignment and redzone requested.
1925          * This increases the chance that hibound will be adjusted to
1926          * a_lastgap->s_base, which will likely allow us to find an
1927          * acceptable hole in the address space more quickly.
1928          * If we can't find a hole with this fast_path, then we look for
1929          * smaller holes in which the alignment and offset may allow
1930          * the allocation to fit.
1931          */
1932         minlen += align;
1933         minlen += 2 * redzone;
1934         redzone = 0;
1935 
1936         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1937         if (AS_SEGFIRST(as) == NULL) {
1938                 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1939                     align, redzone, off)) {
1940                         AS_LOCK_EXIT(as, &as->a_lock);
1941                         return (0);
1942                 } else {
1943                         AS_LOCK_EXIT(as, &as->a_lock);
1944                         *basep = save_base;
1945                         *lenp = save_len;
1946                         return (-1);
1947                 }
1948         }
1949 
1950 retry:
1951         /*
1952          * Set up to iterate over all the inter-segment holes in the given
1953          * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1954          * NULL for the highest-addressed hole.  If moving backwards, we reset
1955  * hseg to denote the highest-addressed segment.
1956          */
1957         forward = (flags & AH_DIR) == AH_LO;
1958         if (forward) {
1959                 hseg = as_findseg(as, lobound, 1);
1960                 lseg = AS_SEGPREV(as, hseg);
1961         } else {
1962 
1963                 /*
1964                  * If allocating at least as much as the last allocation,
1965                  * use a_lastgap's base as a better estimate of hibound.
1966                  */
1967                 if (as->a_lastgap &&
1968                     minlen >= as->a_lastgap->s_size &&
1969                     hibound >= as->a_lastgap->s_base)
1970                         hibound = as->a_lastgap->s_base;
1971 
1972                 hseg = as_findseg(as, hibound, 1);
1973                 if (hseg->s_base + hseg->s_size < hibound) {
1974                         lseg = hseg;
1975                         hseg = NULL;
1976                 } else {
1977                         lseg = AS_SEGPREV(as, hseg);
1978                 }
1979         }
1980 
1981         for (;;) {
1982                 /*
1983                  * Set lo and hi to the hole's boundaries.  (We should really
1984                  * use MAXADDR in place of hibound in the expression below,
1985                  * but can't express it easily; using hibound in its place is
1986                  * harmless.)
1987                  */
1988                 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1989                 hi = (hseg == NULL) ? hibound : hseg->s_base;
1990                 /*
1991                  * If the iteration has moved past the interval from lobound
1992                  * to hibound it's pointless to continue.
1993                  */
1994                 if ((forward && lo > hibound) || (!forward && hi < lobound))
1995                         break;
1996                 else if (lo > hibound || hi < lobound)
1997                         goto cont;
1998                 /*
1999                  * Candidate hole lies at least partially within the allowable
2000                  * range.  Restrict it to fall completely within that range,
2001                  * i.e., to [max(lo, lobound), min(hi, hibound)].
2002                  */
2003                 if (lo < lobound)
2004                         lo = lobound;
2005                 if (hi > hibound)
2006                         hi = hibound;
2007                 /*
2008                  * Verify that the candidate hole is big enough and meets
2009                  * hardware constraints.  If the hole is too small, no need
2010                  * to do the further checks since they will fail.
2011                  */
2012                 *basep = lo;
2013                 *lenp = hi - lo;
2014                 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
2015                     minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
2016                     ((flags & AH_CONTAIN) == 0 ||
2017                     (*basep <= addr && *basep + *lenp > addr))) {
2018                         if (!forward)
2019                                 as->a_lastgap = hseg;
2020                         if (hseg != NULL)
2021                                 as->a_lastgaphl = hseg;
2022                         else
2023                                 as->a_lastgaphl = lseg;
2024                         AS_LOCK_EXIT(as, &as->a_lock);
2025                         return (0);
2026                 }
2027         cont:
2028                 /*
2029                  * Move to the next hole.
2030                  */
2031                 if (forward) {
2032                         lseg = hseg;
2033                         if (lseg == NULL)
2034                                 break;
2035                         hseg = AS_SEGNEXT(as, hseg);
2036                 } else {
2037                         hseg = lseg;
2038                         if (hseg == NULL)
2039                                 break;
2040                         lseg = AS_SEGPREV(as, lseg);
2041                 }
2042         }
2043         if (fast_path && (align != 0 || save_redzone != 0)) {
2044                 fast_path = 0;
2045                 minlen = save_minlen;
2046                 redzone = save_redzone;
2047                 goto retry;
2048         }
2049         *basep = save_base;
2050         *lenp = save_len;
2051         AS_LOCK_EXIT(as, &as->a_lock);
2052         return (-1);
2053 }
2054 
2055 /*
2056  * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2057  *
2058  * If flags specifies AH_HI, the hole will have the highest possible address
2059  * in the range.  We use the as->a_lastgap field to figure out where to
2060  * start looking for a gap.
2061  *
2062  * Otherwise, the gap will have the lowest possible address.
2063  *
2064  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2065  *
2066  * If an adequate hole is found, base and len are set to reflect the part of
2067  * the hole that is within range, and 0 is returned, otherwise,
2068  * -1 is returned.
2069  *
2070  * NOTE: This routine is not correct when base+len overflows caddr_t.
2071  */
2072 int
2073 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2074     caddr_t addr)
2075 {
2076 
2077         return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2078 }
2079 
2080 /*
2081  * Return the next range within [base, base + len) that is backed
2082  * with "real memory".  Skip holes and non-seg_vn segments.
2083  * We're lazy and only return one segment at a time.
2084  */
2085 int
2086 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2087 {
2088         extern const struct seg_ops segspt_shmops; /* needs a header file */
2089         struct seg *seg;
2090         caddr_t addr, eaddr;
2091         caddr_t segend;
2092 
2093         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2094 
2095         addr = *basep;
2096         eaddr = addr + *lenp;
2097 
2098         seg = as_findseg(as, addr, 0);
2099         if (seg != NULL)
2100                 addr = MAX(seg->s_base, addr);
2101 
2102         for (;;) {
2103                 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2104                         AS_LOCK_EXIT(as, &as->a_lock);
2105                         return (EINVAL);
2106                 }
2107 
2108                 if (seg->s_ops == &segvn_ops) {
2109                         segend = seg->s_base + seg->s_size;
2110                         break;
2111                 }
2112 
2113                 /*
2114                  * We do ISM by looking into the private data
2115                  * to determine the real size of the segment.
2116                  */
2117                 if (seg->s_ops == &segspt_shmops) {
2118                         segend = seg->s_base + spt_realsize(seg);
2119                         if (addr < segend)
2120                                 break;
2121                 }
2122 
2123                 seg = AS_SEGNEXT(as, seg);
2124 
2125                 if (seg != NULL)
2126                         addr = seg->s_base;
2127         }
2128 
2129         *basep = addr;
2130 
2131         if (segend > eaddr)
2132                 *lenp = eaddr - addr;
2133         else
2134                 *lenp = segend - addr;
2135 
2136         AS_LOCK_EXIT(as, &as->a_lock);
2137         return (0);
2138 }
2139 
2140 /*
2141  * Swap the pages associated with the address space as out to
2142  * secondary storage, returning the number of bytes actually
2143  * swapped.
2144  *
2145  * The value returned is intended to correlate well with the process's
2146  * memory requirements.  Its usefulness for this purpose depends on
2147  * how well the segment-level routines do at returning accurate
2148  * information.
2149  */
2150 size_t
2151 as_swapout(struct as *as)
2152 {
2153         struct seg *seg;
2154         size_t swpcnt = 0;
2155 
2156         /*
2157          * Kernel-only processes have given up their address
2158          * spaces.  Of course, we shouldn't be attempting to
2159          * swap out such processes in the first place...
2160          */
2161         if (as == NULL)
2162                 return (0);
2163 
2164         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2165 
2166         /* Prevent XHATs from attaching */
2167         mutex_enter(&as->a_contents);
2168         AS_SETBUSY(as);
2169         mutex_exit(&as->a_contents);
2170 
2171 
2172         /*
2173          * Free all mapping resources associated with the address
2174          * space.  The segment-level swapout routines capitalize
2175          * on this unmapping by scavenging pages that have become
2176          * unmapped here.
2177          */
2178         hat_swapout(as->a_hat);
2179         if (as->a_xhat != NULL)
2180                 xhat_swapout_all(as);
2181 
2182         mutex_enter(&as->a_contents);
2183         AS_CLRBUSY(as);
2184         mutex_exit(&as->a_contents);
2185 
2186         /*
2187          * Call the swapout routines of all segments in the address
2188          * space to do the actual work, accumulating the amount of
2189          * space reclaimed.
2190          */
2191         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2192                 const struct seg_ops *ov = seg->s_ops;
2193 
2194                 /*
2195                  * We have to check to see if the seg has
2196                  * an ops vector because the seg may have
2197                  * been in the middle of being set up when
2198                  * the process was picked for swapout.
2199                  */
2200                 if ((ov != NULL) && (ov->swapout != NULL))
2201                         swpcnt += segop_swapout(seg);
2202         }
2203         AS_LOCK_EXIT(as, &as->a_lock);
2204         return (swpcnt);
2205 }
2206 
2207 /*
2208  * Determine whether data from the mappings in interval [addr, addr + size)
2209  * are in the primary memory (core) cache.
2210  */
2211 int
2212 as_incore(struct as *as, caddr_t addr,
2213     size_t size, char *vec, size_t *sizep)
2214 {
2215         struct seg *seg;
2216         size_t ssize;
2217         caddr_t raddr;          /* rounded down addr */
2218         size_t rsize;           /* rounded up size */
2219         size_t isize;                   /* iteration size */
2220         int error = 0;          /* result, assume success */
2221 
2222         *sizep = 0;
2223         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2224         rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2225             (size_t)raddr;
2226 
2227         if (raddr + rsize < raddr)           /* check for wraparound */
2228                 return (ENOMEM);
2229 
2230         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2231         seg = as_segat(as, raddr);
2232         if (seg == NULL) {
2233                 AS_LOCK_EXIT(as, &as->a_lock);
2234                 return (-1);
2235         }
2236 
2237         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2238                 if (raddr >= seg->s_base + seg->s_size) {
2239                         seg = AS_SEGNEXT(as, seg);
2240                         if (seg == NULL || raddr != seg->s_base) {
2241                                 error = -1;
2242                                 break;
2243                         }
2244                 }
2245                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2246                         ssize = seg->s_base + seg->s_size - raddr;
2247                 else
2248                         ssize = rsize;
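                     /*
                      * segop_incore() fills one byte of vec per page and
                      * returns the number of bytes of the range it accounted
                      * for; a short return is treated as a failure of the
                      * whole operation.
                      */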
2249                 *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2250                 if (isize != ssize) {
2251                         error = -1;
2252                         break;
2253                 }
2254                 vec += btopr(ssize);
2255         }
2256         AS_LOCK_EXIT(as, &as->a_lock);
2257         return (error);
2258 }
2259 
2260 static void
2261 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2262         ulong_t *bitmap, size_t position, size_t npages)
2263 {
2264         caddr_t range_start;
2265         size_t  pos1 = position;
2266         size_t  pos2;
2267         size_t  size;
2268         size_t  end_pos = npages + position;
2269 
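             /*
              * Each bt_range() call yields the next run [pos1, pos2) of set
              * bits in the bitmap; every set bit marks a page that was
              * actually locked, so only those page ranges are unlocked here.
              */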
2270         while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2271                 size = ptob((pos2 - pos1));
2272                 range_start = (caddr_t)((uintptr_t)addr +
2273                     ptob(pos1 - position));
2274 
2275                 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2276                     (ulong_t *)NULL, (size_t)NULL);
2277                 pos1 = pos2;
2278         }
2279 }
2280 
2281 static void
2282 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2283         caddr_t raddr, size_t rsize)
2284 {
2285         struct seg *seg = as_segat(as, raddr);
2286         size_t ssize;
2287 
2288         while (rsize != 0) {
2289                 if (raddr >= seg->s_base + seg->s_size)
2290                         seg = AS_SEGNEXT(as, seg);
2291 
2292                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2293                         ssize = seg->s_base + seg->s_size - raddr;
2294                 else
2295                         ssize = rsize;
2296 
2297                 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2298 
2299                 rsize -= ssize;
2300                 raddr += ssize;
2301         }
2302 }
2303 
2304 /*
2305  * Cache control operations over the interval [addr, addr + size) in
2306  * address space "as".
2307  */
2308 /*ARGSUSED*/
2309 int
2310 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2311     uintptr_t arg, ulong_t *lock_map, size_t pos)
2312 {
2313         struct seg *seg;        /* working segment */
2314         caddr_t raddr;          /* rounded down addr */
2315         caddr_t initraddr;      /* saved initial rounded down addr */
2316         size_t rsize;           /* rounded up size */
2317         size_t initrsize;       /* saved initial rounded up size */
2318         size_t ssize;           /* size of seg */
2319         int error = 0;                  /* result */
2320         size_t mlock_size;      /* size of bitmap */
2321         ulong_t *mlock_map;     /* pointer to bitmap used */
2322                                 /* to represent the locked */
2323                                 /* pages. */
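             /*
              * We come back to retry with the address space lock held as
              * writer when segop_advise() returns IE_RETRY (the segment
              * driver needs to split or concatenate segments), and as reader
              * after an EDEADLK-induced delay in the lockerr path below.
              */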
2324 retry:
2325         if (error == IE_RETRY)
2326                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2327         else
2328                 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2329 
2330         /*
2331          * If these are address space lock/unlock operations, loop over
2332          * all segments in the address space, as appropriate.
2333          */
2334         if (func == MC_LOCKAS) {
2335                 size_t npages, idx;
2336                 size_t rlen = 0;        /* rounded as length */
2337 
2338                 idx = pos;
2339 
2340                 if (arg & MCL_FUTURE) {
2341                         mutex_enter(&as->a_contents);
2342                         AS_SETPGLCK(as);
2343                         mutex_exit(&as->a_contents);
2344                 }
2345                 if ((arg & MCL_CURRENT) == 0) {
2346                         AS_LOCK_EXIT(as, &as->a_lock);
2347                         return (0);
2348                 }
2349 
2350                 seg = AS_SEGFIRST(as);
2351                 if (seg == NULL) {
2352                         AS_LOCK_EXIT(as, &as->a_lock);
2353                         return (0);
2354                 }
2355 
2356                 do {
2357                         raddr = (caddr_t)((uintptr_t)seg->s_base &
2358                             (uintptr_t)PAGEMASK);
2359                         rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2360                             PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2361                 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2362 
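                     /*
                      * mlock_map has one bit per page summed over every
                      * segment in the address space.  segop_lockop() marks
                      * the pages it locks so that a partial failure can be
                      * undone by as_segunlock() below.
                      */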
2363                 mlock_size = BT_BITOUL(btopr(rlen));
2364                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2365                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2366                                 AS_LOCK_EXIT(as, &as->a_lock);
2367                                 return (EAGAIN);
2368                 }
2369 
2370                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2371                         error = segop_lockop(seg, seg->s_base,
2372                             seg->s_size, attr, MC_LOCK, mlock_map, pos);
2373                         if (error != 0)
2374                                 break;
2375                         pos += seg_pages(seg);
2376                 }
2377 
2378                 if (error) {
2379                         for (seg = AS_SEGFIRST(as); seg != NULL;
2380                             seg = AS_SEGNEXT(as, seg)) {
2381 
2382                                 raddr = (caddr_t)((uintptr_t)seg->s_base &
2383                                     (uintptr_t)PAGEMASK);
2384                                 npages = seg_pages(seg);
2385                                 as_segunlock(seg, raddr, attr, mlock_map,
2386                                     idx, npages);
2387                                 idx += npages;
2388                         }
2389                 }
2390 
2391                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2392                 AS_LOCK_EXIT(as, &as->a_lock);
2393                 goto lockerr;
2394         } else if (func == MC_UNLOCKAS) {
2395                 mutex_enter(&as->a_contents);
2396                 AS_CLRPGLCK(as);
2397                 mutex_exit(&as->a_contents);
2398 
2399                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2400                         error = segop_lockop(seg, seg->s_base,
2401                             seg->s_size, attr, MC_UNLOCK, NULL, 0);
2402                         if (error != 0)
2403                                 break;
2404                 }
2405 
2406                 AS_LOCK_EXIT(as, &as->a_lock);
2407                 goto lockerr;
2408         }
2409 
2410         /*
2411          * Normalize addresses and sizes.
2412          */
2413         initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2414         initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2415             (size_t)raddr;
2416 
2417         if (raddr + rsize < raddr) {         /* check for wraparound */
2418                 AS_LOCK_EXIT(as, &as->a_lock);
2419                 return (ENOMEM);
2420         }
2421 
2422         /*
2423          * Get initial segment.
2424          */
2425         if ((seg = as_segat(as, raddr)) == NULL) {
2426                 AS_LOCK_EXIT(as, &as->a_lock);
2427                 return (ENOMEM);
2428         }
2429 
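             /*
              * For MC_LOCK, allocate a bitmap with one bit per page in the
              * normalized range so that pages locked by earlier segments can
              * be unlocked again if a later segment fails.
              */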
2430         if (func == MC_LOCK) {
2431                 mlock_size = BT_BITOUL(btopr(rsize));
2432                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2433                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2434                                 AS_LOCK_EXIT(as, &as->a_lock);
2435                                 return (EAGAIN);
2436                 }
2437         }
2438 
2439         /*
2440          * Loop over all segments.  If a hole in the address range is
2441          * discovered, then fail.  For each segment, perform the appropriate
2442          * control operation.
2443          */
2444         while (rsize != 0) {
2445 
2446                 /*
2447                  * Make sure there's no hole, calculate the portion
2448                  * of the next segment to be operated over.
2449                  */
2450                 if (raddr >= seg->s_base + seg->s_size) {
2451                         seg = AS_SEGNEXT(as, seg);
2452                         if (seg == NULL || raddr != seg->s_base) {
2453                                 if (func == MC_LOCK) {
2454                                         as_unlockerr(as, attr, mlock_map,
2455                                             initraddr, initrsize - rsize);
2456                                         kmem_free(mlock_map,
2457                                             mlock_size * sizeof (ulong_t));
2458                                 }
2459                                 AS_LOCK_EXIT(as, &as->a_lock);
2460                                 return (ENOMEM);
2461                         }
2462                 }
2463                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2464                         ssize = seg->s_base + seg->s_size - raddr;
2465                 else
2466                         ssize = rsize;
2467 
2468                 /*
2469                  * Dispatch on specific function.
2470                  */
2471                 switch (func) {
2472 
2473                 /*
2474                  * Synchronize cached data from mappings with backing
2475                  * objects.
2476                  */
2477                 case MC_SYNC:
2478                         if (error = segop_sync(seg, raddr, ssize,
2479                             attr, (uint_t)arg)) {
2480                                 AS_LOCK_EXIT(as, &as->a_lock);
2481                                 return (error);
2482                         }
2483                         break;
2484 
2485                 /*
2486                  * Lock pages in memory.
2487                  */
2488                 case MC_LOCK:
2489                         if (error = segop_lockop(seg, raddr, ssize,
2490                             attr, func, mlock_map, pos)) {
2491                                 as_unlockerr(as, attr, mlock_map, initraddr,
2492                                     initrsize - rsize + ssize);
2493                                 kmem_free(mlock_map, mlock_size *
2494                                     sizeof (ulong_t));
2495                                 AS_LOCK_EXIT(as, &as->a_lock);
2496                                 goto lockerr;
2497                         }
2498                         break;
2499 
2500                 /*
2501                  * Unlock mapped pages.
2502                  */
2503                 case MC_UNLOCK:
2504                         (void) segop_lockop(seg, raddr, ssize, attr, func,
2505                             (ulong_t *)NULL, (size_t)NULL);
2506                         break;
2507 
2508                 /*
2509                  * Store VM advice for mapped pages in the segment layer.
2510                  */
2511                 case MC_ADVISE:
2512                         error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2513 
2514                         /*
2515                          * Check for regular errors and special retry error
2516                          */
2517                         if (error) {
2518                                 if (error == IE_RETRY) {
2519                                         /*
2520                                          * Need to acquire writers lock, so
2521                                          * have to drop readers lock and start
2522                                          * all over again
2523                                          */
2524                                         AS_LOCK_EXIT(as, &as->a_lock);
2525                                         goto retry;
2526                                 } else if (error == IE_REATTACH) {
2527                                         /*
2528                                          * Find segment for current address
2529                                          * because current segment just got
2530                                          * split or concatenated
2531                                          */
2532                                         seg = as_segat(as, raddr);
2533                                         if (seg == NULL) {
2534                                                 AS_LOCK_EXIT(as, &as->a_lock);
2535                                                 return (ENOMEM);
2536                                         }
2537                                 } else {
2538                                         /*
2539                                          * Regular error
2540                                          */
2541                                         AS_LOCK_EXIT(as, &as->a_lock);
2542                                         return (error);
2543                                 }
2544                         }
2545                         break;
2546 
2547                 case MC_INHERIT_ZERO:
2548                         error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2549                         if (error != 0) {
2550                                 AS_LOCK_EXIT(as, &as->a_lock);
2551                                 return (error);
2552                         }
2553                         break;
2554 
2555                 /*
2556                  * Can't happen.
2557                  */
2558                 default:
2559                         panic("as_ctl: bad operation %d", func);
2560                         /*NOTREACHED*/
2561                 }
2562 
2563                 rsize -= ssize;
2564                 raddr += ssize;
2565         }
2566 
2567         if (func == MC_LOCK)
2568                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2569         AS_LOCK_EXIT(as, &as->a_lock);
2570         return (0);
2571 lockerr:
2572 
2573         /*
2574          * If the lower levels returned EDEADLK for a segment lockop,
2575          * it means that we should retry the operation.  Let's also wait
2576          * a bit to let the deadlock-causing condition clear.
2577          * This is part of a gross hack to work around a design flaw
2578          * in the ufs/sds logging code and should go away when the
2579          * logging code is re-designed to fix the problem. See bug
2580          * 4125102 for details of the problem.
2581          */
2582         if (error == EDEADLK) {
2583                 delay(deadlk_wait);
2584                 error = 0;
2585                 goto retry;
2586         }
2587         return (error);
2588 }
2589 
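     /*
      * Translate a faultcode_t returned by as_fault() into an errno value:
      * FC_OBJERR carries an embedded errno, FC_PROT becomes EACCES, and
      * anything else maps to EFAULT.
      */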
2590 int
2591 fc_decode(faultcode_t fault_err)
2592 {
2593         int error = 0;
2594 
2595         switch (FC_CODE(fault_err)) {
2596         case FC_OBJERR:
2597                 error = FC_ERRNO(fault_err);
2598                 break;
2599         case FC_PROT:
2600                 error = EACCES;
2601                 break;
2602         default:
2603                 error = EFAULT;
2604                 break;
2605         }
2606         return (error);
2607 }
2608 
2609 /*
2610  * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2611  * lists from each segment and copy them to one contiguous shadow list (plist)
2612  * as expected by the caller.  Save pointers to per segment shadow lists at
2613  * the tail of plist so that they can be used during as_pageunlock().
2614  */
2615 static int
2616 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2617     caddr_t addr, size_t size, enum seg_rw rw)
2618 {
2619         caddr_t sv_addr = addr;
2620         size_t sv_size = size;
2621         struct seg *sv_seg = seg;
2622         ulong_t segcnt = 1;
2623         ulong_t cnt;
2624         size_t ssize;
2625         pgcnt_t npages = btop(size);
2626         page_t **plist;
2627         page_t **pl;
2628         int error;
2629         caddr_t eaddr;
2630         faultcode_t fault_err = 0;
2631         pgcnt_t pl_off;
2632         extern const struct seg_ops segspt_shmops;
2633 
2634         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2635         ASSERT(seg != NULL);
2636         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2637         ASSERT(addr + size > seg->s_base + seg->s_size);
2638         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2639         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2640 
2641         /*
2642          * Count the number of segments covered by the range we are about to
2643          * lock. The segment count is used to size the shadow list we return
2644          * back to the caller.
2645          */
2646         for (; size != 0; size -= ssize, addr += ssize) {
2647                 if (addr >= seg->s_base + seg->s_size) {
2648 
2649                         seg = AS_SEGNEXT(as, seg);
2650                         if (seg == NULL || addr != seg->s_base) {
2651                                 AS_LOCK_EXIT(as, &as->a_lock);
2652                                 return (EFAULT);
2653                         }
2654                         /*
2655                          * Do a quick check if subsequent segments
2656                          * will most likely support pagelock.
2657                          */
2658                         if (seg->s_ops == &segvn_ops) {
2659                                 vnode_t *vp;
2660 
2661                                 if (segop_getvp(seg, addr, &vp) != 0 ||
2662                                     vp != NULL) {
2663                                         AS_LOCK_EXIT(as, &as->a_lock);
2664                                         goto slow;
2665                                 }
2666                         } else if (seg->s_ops != &segspt_shmops) {
2667                                 AS_LOCK_EXIT(as, &as->a_lock);
2668                                 goto slow;
2669                         }
2670                         segcnt++;
2671                 }
2672                 if (addr + size > seg->s_base + seg->s_size) {
2673                         ssize = seg->s_base + seg->s_size - addr;
2674                 } else {
2675                         ssize = size;
2676                 }
2677         }
2678         ASSERT(segcnt > 1);
2679 
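             /*
              * Shadow list layout: the first npages entries form the single
              * contiguous page list handed back to the caller; the trailing
              * segcnt entries save each segment's own shadow list pointer so
              * that the per-segment lists can be passed back to
              * segop_pagelock() with L_PAGEUNLOCK at unlock time.
              */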
2680         plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2681 
2682         addr = sv_addr;
2683         size = sv_size;
2684         seg = sv_seg;
2685 
2686         for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2687                 if (addr >= seg->s_base + seg->s_size) {
2688                         seg = AS_SEGNEXT(as, seg);
2689                         ASSERT(seg != NULL && addr == seg->s_base);
2690                         cnt++;
2691                         ASSERT(cnt < segcnt);
2692                 }
2693                 if (addr + size > seg->s_base + seg->s_size) {
2694                         ssize = seg->s_base + seg->s_size - addr;
2695                 } else {
2696                         ssize = size;
2697                 }
2698                 pl = &plist[npages + cnt];
2699                 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2700                     L_PAGELOCK, rw);
2701                 if (error) {
2702                         break;
2703                 }
2704                 ASSERT(plist[npages + cnt] != NULL);
2705                 ASSERT(pl_off + btop(ssize) <= npages);
2706                 bcopy(plist[npages + cnt], &plist[pl_off],
2707                     btop(ssize) * sizeof (page_t *));
2708                 pl_off += btop(ssize);
2709         }
2710 
2711         if (size == 0) {
2712                 AS_LOCK_EXIT(as, &as->a_lock);
2713                 ASSERT(cnt == segcnt - 1);
2714                 *ppp = plist;
2715                 return (0);
2716         }
2717 
2718         /*
2719          * One of the pagelock calls failed. The error type is in the error
2720          * variable. Unlock what we've locked so far and retry with F_SOFTLOCK
2721          * if the error type is either EFAULT or ENOTSUP. Otherwise just return
2722          * the error back to the caller.
2723          */
2724 
2725         eaddr = addr;
2726         seg = sv_seg;
2727 
2728         for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2729                 if (addr >= seg->s_base + seg->s_size) {
2730                         seg = AS_SEGNEXT(as, seg);
2731                         ASSERT(seg != NULL && addr == seg->s_base);
2732                         cnt++;
2733                         ASSERT(cnt < segcnt);
2734                 }
2735                 if (eaddr > seg->s_base + seg->s_size) {
2736                         ssize = seg->s_base + seg->s_size - addr;
2737                 } else {
2738                         ssize = eaddr - addr;
2739                 }
2740                 pl = &plist[npages + cnt];
2741                 ASSERT(*pl != NULL);
2742                 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2743                     L_PAGEUNLOCK, rw);
2744         }
2745 
2746         AS_LOCK_EXIT(as, &as->a_lock);
2747 
2748         kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2749 
2750         if (error != ENOTSUP && error != EFAULT) {
2751                 return (error);
2752         }
2753 
2754 slow:
2755         /*
2756          * If we are here because pagelock failed due to the need to cow-fault
2757          * in the pages we want to lock, F_SOFTLOCK will do that job, and the
2758          * next as_pagelock() call for this address range will hopefully
2759          * succeed.
2760          */
2761         fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2762         if (fault_err != 0) {
2763                 return (fc_decode(fault_err));
2764         }
2765         *ppp = NULL;
2766 
2767         return (0);
2768 }
2769 
2770 /*
2771  * Lock pages in a given address space and return the shadow list.  If
2772  * the list is NULL, the MMU mapping is also locked.
2773  */
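     /*
      * A typical caller looks roughly like the sketch below ('uaddr', 'len'
      * and the I/O step are illustrative placeholders, not taken from this
      * file):
      *
      *        page_t **pplist;
      *
      *        if (as_pagelock(as, &pplist, uaddr, len, S_WRITE) == 0) {
      *                ... perform the I/O against the locked pages ...
      *                as_pageunlock(as, pplist, uaddr, len, S_WRITE);
      *        }
      */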
2774 int
2775 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2776     size_t size, enum seg_rw rw)
2777 {
2778         size_t rsize;
2779         caddr_t raddr;
2780         faultcode_t fault_err;
2781         struct seg *seg;
2782         int err;
2783 
2784         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2785             "as_pagelock_start: addr %p size %ld", addr, size);
2786 
2787         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2788         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2789             (size_t)raddr;
2790 
2791         /*
2792          * If the request spans more than one segment, let
2793          * as_pagelock_segs() handle it.
2794          */
2795         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2796 
2797         seg = as_segat(as, raddr);
2798         if (seg == NULL) {
2799                 AS_LOCK_EXIT(as, &as->a_lock);
2800                 return (EFAULT);
2801         }
2802         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2803         if (raddr + rsize > seg->s_base + seg->s_size) {
2804                 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2805         }
2806         if (raddr + rsize <= raddr) {
2807                 AS_LOCK_EXIT(as, &as->a_lock);
2808                 return (EFAULT);
2809         }
2810 
2811         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2812             "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2813 
2814         /*
2815          * try to lock pages and pass back shadow list
2816          */
2817         err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2818 
2819         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2820 
2821         AS_LOCK_EXIT(as, &as->a_lock);
2822 
2823         if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2824                 return (err);
2825         }
2826 
2827         /*
2828          * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2829          * to no pagelock support for this segment or because pages need to be
2830          * cow-faulted in.  If a fault is needed, F_SOFTLOCK does that job for
2831          * this as_pagelock() call, and the next as_pagelock() call for the
2832          * same address range will hopefully succeed via pagelock.
2833          */
2834         fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2835         if (fault_err != 0) {
2836                 return (fc_decode(fault_err));
2837         }
2838         *ppp = NULL;
2839 
2840         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2841         return (0);
2842 }
2843 
2844 /*
2845  * Unlock pages locked by as_pagelock_segs().  Retrieve the per-segment shadow
2846  * lists from the end of plist and call the pageunlock interface for each
2847  * segment.  Drop the as lock and free plist.
2848  */
2849 static void
2850 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2851     struct page **plist, enum seg_rw rw)
2852 {
2853         ulong_t cnt;
2854         caddr_t eaddr = addr + size;
2855         pgcnt_t npages = btop(size);
2856         size_t ssize;
2857         page_t **pl;
2858 
2859         ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2860         ASSERT(seg != NULL);
2861         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2862         ASSERT(addr + size > seg->s_base + seg->s_size);
2863         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2864         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2865         ASSERT(plist != NULL);
2866 
2867         for (cnt = 0; addr < eaddr; addr += ssize) {
2868                 if (addr >= seg->s_base + seg->s_size) {
2869                         seg = AS_SEGNEXT(as, seg);
2870                         ASSERT(seg != NULL && addr == seg->s_base);
2871                         cnt++;
2872                 }
2873                 if (eaddr > seg->s_base + seg->s_size) {
2874                         ssize = seg->s_base + seg->s_size - addr;
2875                 } else {
2876                         ssize = eaddr - addr;
2877                 }
2878                 pl = &plist[npages + cnt];
2879                 ASSERT(*pl != NULL);
2880                 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2881                     L_PAGEUNLOCK, rw);
2882         }
2883         ASSERT(cnt > 0);
2884         AS_LOCK_EXIT(as, &as->a_lock);
2885 
2886         cnt++;
2887         kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2888 }
2889 
2890 /*
2891  * Unlock the pages locked by a previous as_pagelock() call on this range.
2892  */
2893 void
2894 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2895     enum seg_rw rw)
2896 {
2897         struct seg *seg;
2898         size_t rsize;
2899         caddr_t raddr;
2900 
2901         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2902             "as_pageunlock_start: addr %p size %ld", addr, size);
2903 
2904         /*
2905          * If the shadow list is NULL, as_pagelock() fell back to
2906          * as_fault(), so undo the lock with F_SOFTUNLOCK.
2907          */
2908         if (pp == NULL) {
2909                 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2910                 return;
2911         }
2912 
2913         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2914         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2915             (size_t)raddr;
2916 
2917         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2918         seg = as_segat(as, raddr);
2919         ASSERT(seg != NULL);
2920 
2921         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2922             "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2923 
2924         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2925         if (raddr + rsize <= seg->s_base + seg->s_size) {
2926                 (void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2927         } else {
2928                 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2929                 return;
2930         }
2931         AS_LOCK_EXIT(as, &as->a_lock);
2932         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2933 }
2934 
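     /*
      * Set the page size code 'szc' for all mappings in the range
      * [addr, addr + size).  Both addr and size must be aligned to the
      * requested page size.  If 'wait' is set, block and retry when the
      * request fails with EAGAIN because pages in the range are locked.
      */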
2935 int
2936 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2937     boolean_t wait)
2938 {
2939         struct seg *seg;
2940         size_t ssize;
2941         caddr_t raddr;                  /* rounded down addr */
2942         size_t rsize;                   /* rounded up size */
2943         int error = 0;
2944         size_t pgsz = page_get_pagesize(szc);
2945 
2946 setpgsz_top:
2947         if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2948                 return (EINVAL);
2949         }
2950 
2951         raddr = addr;
2952         rsize = size;
2953 
2954         if (raddr + rsize < raddr)           /* check for wraparound */
2955                 return (ENOMEM);
2956 
2957         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
2958         as_clearwatchprot(as, raddr, rsize);
2959         seg = as_segat(as, raddr);
2960         if (seg == NULL) {
2961                 as_setwatch(as);
2962                 AS_LOCK_EXIT(as, &as->a_lock);
2963                 return (ENOMEM);
2964         }
2965 
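             /*
              * Walk the segments covering [raddr, raddr + rsize) and ask
              * each one to change the page size of its piece of the range.
              */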
2966         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2967                 if (raddr >= seg->s_base + seg->s_size) {
2968                         seg = AS_SEGNEXT(as, seg);
2969                         if (seg == NULL || raddr != seg->s_base) {
2970                                 error = ENOMEM;
2971                                 break;
2972                         }
2973                 }
2974                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2975                         ssize = seg->s_base + seg->s_size - raddr;
2976                 } else {
2977                         ssize = rsize;
2978                 }
2979 
2980 retry:
2981                 error = segop_setpagesize(seg, raddr, ssize, szc);
2982 
2983                 if (error == IE_NOMEM) {
2984                         error = EAGAIN;
2985                         break;
2986                 }
2987 
2988                 if (error == IE_RETRY) {
2989                         AS_LOCK_EXIT(as, &as->a_lock);
2990                         goto setpgsz_top;
2991                 }
2992 
2993                 if (error == ENOTSUP) {
2994                         error = EINVAL;
2995                         break;
2996                 }
2997 
2998                 if (wait && (error == EAGAIN)) {
2999                         /*
3000                          * Memory is currently locked.  It must be unlocked
3001                          * before this operation can succeed through a retry.
3002                          * The possible reasons for locked memory and
3003                          * corresponding strategies for unlocking are:
3004                          * (1) Normal I/O
3005                          *      wait for a signal that the I/O operation
3006                          *      has completed and the memory is unlocked.
3007                          * (2) Asynchronous I/O
3008                          *      The aio subsystem does not unlock pages when
3009                          *      the I/O is completed. Those pages are unlocked
3010                          *      when the application calls aiowait/aioerror.
3011                          *      So, to prevent blocking forever, cv_broadcast()
3012                          *      is done to wake up aio_cleanup_thread.
3013                          *      Subsequently, segvn_reclaim will be called, and
3014                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
3015                          * (3) Long term page locking:
3016                          *      This is not relevant for as_setpagesize()
3017                          *      because we cannot change the page size for
3018                          *      driver memory. The attempt to do so will
3019                          *      fail with a different error than EAGAIN so
3020                          *      there's no need to trigger as callbacks like
3021                          *      as_unmap, as_setprot or as_free would do.
3022                          */
3023                         mutex_enter(&as->a_contents);
3024                         if (!AS_ISNOUNMAPWAIT(as)) {
3025                                 if (AS_ISUNMAPWAIT(as) == 0) {
3026                                         cv_broadcast(&as->a_cv);
3027                                 }
3028                                 AS_SETUNMAPWAIT(as);
3029                                 AS_LOCK_EXIT(as, &as->a_lock);
3030                                 while (AS_ISUNMAPWAIT(as)) {
3031                                         cv_wait(&as->a_cv, &as->a_contents);
3032                                 }
3033                         } else {
3034                                 /*
3035                                  * We may have raced with
3036                                  * segvn_reclaim()/segspt_reclaim(). In this
3037                                  * case clean nounmapwait flag and retry since
3038                                  * softlockcnt in this segment may be already
3039                                  * 0.  We don't drop as writer lock so our
3040                                  * number of retries without sleeping should
3041                                  * be very small. See segvn_reclaim() for
3042                                  * more comments.
3043                                  */
3044                                 AS_CLRNOUNMAPWAIT(as);
3045                                 mutex_exit(&as->a_contents);
3046                                 goto retry;
3047                         }
3048                         mutex_exit(&as->a_contents);
3049                         goto setpgsz_top;
3050                 } else if (error != 0) {
3051                         break;
3052                 }
3053         }
3054         as_setwatch(as);
3055         AS_LOCK_EXIT(as, &as->a_lock);
3056         return (error);
3057 }
3058 
3059 /*
3060  * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
3061  * in its chunk where s_szc is less than the szc we want to set.
3062  */
3063 static int
3064 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3065     int *retry)
3066 {
3067         struct seg *seg;
3068         size_t ssize;
3069         int error;
3070 
3071         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3072 
3073         seg = as_segat(as, raddr);
3074         if (seg == NULL) {
3075                 panic("as_iset3_default_lpsize: no seg");
3076         }
3077 
3078         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3079                 if (raddr >= seg->s_base + seg->s_size) {
3080                         seg = AS_SEGNEXT(as, seg);
3081                         if (seg == NULL || raddr != seg->s_base) {
3082                                 panic("as_iset3_default_lpsize: as changed");
3083                         }
3084                 }
3085                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3086                         ssize = seg->s_base + seg->s_size - raddr;
3087                 } else {
3088                         ssize = rsize;
3089                 }
3090 
3091                 if (szc > seg->s_szc) {
3092                         error = segop_setpagesize(seg, raddr, ssize, szc);
3093                         /* Only retry on EINVAL segments that have no vnode. */
3094                         if (error == EINVAL) {
3095                                 vnode_t *vp = NULL;
3096                                 if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
3097                                     (segop_getvp(seg, raddr, &vp) != 0 ||
3098                                     vp == NULL)) {
3099                                         *retry = 1;
3100                                 } else {
3101                                         *retry = 0;
3102                                 }
3103                         }
3104                         if (error) {
3105                                 return (error);
3106                         }
3107                 }
3108         }
3109         return (0);
3110 }
3111 
3112 /*
3113  * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3114  * pagesize on each segment in its range, but if any fails with EINVAL,
3115  * then it reduces the page size to the next size in the bitmap and
3116  * retries as_iset3_default_lpsize(). The code retries smaller allowed
3117  * sizes on EINVAL because (a) the anon offset may not match the bigger
3118  * sizes, and (b) it's hard to get this offset (to begin with) to pass
3119  * to map_pgszcvec().
3120  */
3121 static int
3122 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3123     uint_t szcvec)
3124 {
3125         int error;
3126         int retry;
3127 
3128         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3129 
3130         for (;;) {
3131                 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3132                 if (error == EINVAL && retry) {
3133                         szcvec &= ~(1 << szc);
3134                         if (szcvec <= 1) {
3135                                 return (EINVAL);
3136                         }
3137                         szc = highbit(szcvec) - 1;
3138                 } else {
3139                         return (error);
3140                 }
3141         }
3142 }
3143 
3144 /*
3145  * as_iset1_default_lpsize() breaks its chunk into areas where existing
3146  * segments have a smaller szc than we want to set. For each such area,
3147  * it calls as_iset2_default_lpsize().
3148  */
3149 static int
3150 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3151     uint_t szcvec)
3152 {
3153         struct seg *seg;
3154         size_t ssize;
3155         caddr_t setaddr = raddr;
3156         size_t setsize = 0;
3157         int set;
3158         int error;
3159 
3160         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3161 
3162         seg = as_segat(as, raddr);
3163         if (seg == NULL) {
3164                 panic("as_iset1_default_lpsize: no seg");
3165         }
3166         if (seg->s_szc < szc) {
3167                 set = 1;
3168         } else {
3169                 set = 0;
3170         }
3171 
3172         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3173                 if (raddr >= seg->s_base + seg->s_size) {
3174                         seg = AS_SEGNEXT(as, seg);
3175                         if (seg == NULL || raddr != seg->s_base) {
3176                                 panic("as_iset1_default_lpsize: as changed");
3177                         }
3178                         if (seg->s_szc >= szc && set) {
3179                                 ASSERT(setsize != 0);
3180                                 error = as_iset2_default_lpsize(as,
3181                                     setaddr, setsize, szc, szcvec);
3182                                 if (error) {
3183                                         return (error);
3184                                 }
3185                                 set = 0;
3186                         } else if (seg->s_szc < szc && !set) {
3187                                 setaddr = raddr;
3188                                 setsize = 0;
3189                                 set = 1;
3190                         }
3191                 }
3192                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3193                         ssize = seg->s_base + seg->s_size - raddr;
3194                 } else {
3195                         ssize = rsize;
3196                 }
3197         }
3198         error = 0;
3199         if (set) {
3200                 ASSERT(setsize != 0);
3201                 error = as_iset2_default_lpsize(as, setaddr, setsize,
3202                     szc, szcvec);
3203         }
3204         return (error);
3205 }
3206 
3207 /*
3208  * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3209  * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3210  * chunk to as_iset1_default_lpsize().
3211  */
3212 static int
3213 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3214     int type)
3215 {
3216         int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3217         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3218             flags, rtype, 1);
3219         uint_t szc;
3220         uint_t nszc;
3221         int error;
3222         caddr_t a;
3223         caddr_t eaddr;
3224         size_t segsize;
3225         size_t pgsz;
3226         uint_t save_szcvec;
3227 
3228         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3229         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3230         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3231 
3232         szcvec &= ~1;
3233         if (szcvec <= 1) {   /* skip if base page size */
3234                 return (0);
3235         }
3236 
3237         /* Get the pagesize of the first larger page size. */
3238         szc = lowbit(szcvec) - 1;
3239         pgsz = page_get_pagesize(szc);
3240         eaddr = addr + size;
3241         addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3242         eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3243 
3244         save_szcvec = szcvec;
3245         szcvec >>= (szc + 1);
3246         nszc = szc;
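             /*
              * Walk up through the larger supported sizes.  At each transition
              * to a bigger size, set the previous (smaller) size on the leading
              * sub-range that ends at the next boundary aligned to the bigger
              * size.
              */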
3247         while (szcvec) {
3248                 if ((szcvec & 0x1) == 0) {
3249                         nszc++;
3250                         szcvec >>= 1;
3251                         continue;
3252                 }
3253                 nszc++;
3254                 pgsz = page_get_pagesize(nszc);
3255                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3256                 if (a != addr) {
3257                         ASSERT(szc > 0);
3258                         ASSERT(a < eaddr);
3259                         segsize = a - addr;
3260                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3261                             save_szcvec);
3262                         if (error) {
3263                                 return (error);
3264                         }
3265                         addr = a;
3266                 }
3267                 szc = nszc;
3268                 szcvec >>= 1;
3269         }
3270 
3271         ASSERT(addr < eaddr);
3272         szcvec = save_szcvec;
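             /*
              * Now work from the largest remaining size downward, setting each
              * size on the chunk of the remaining range that is aligned to it.
              */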
3273         while (szcvec) {
3274                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3275                 ASSERT(a >= addr);
3276                 if (a != addr) {
3277                         ASSERT(szc > 0);
3278                         segsize = a - addr;
3279                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3280                             save_szcvec);
3281                         if (error) {
3282                                 return (error);
3283                         }
3284                         addr = a;
3285                 }
3286                 szcvec &= ~(1 << szc);
3287                 if (szcvec) {
3288                         szc = highbit(szcvec) - 1;
3289                         pgsz = page_get_pagesize(szc);
3290                 }
3291         }
3292         ASSERT(addr == eaddr);
3293 
3294         return (0);
3295 }
3296 
3297 /*
3298  * Set the default large page size for the range. Called via memcntl with
3299  * page size set to 0. as_set_default_lpsize breaks the range down into
3300  * chunks with the same type/flags, ignores non-segvn segments, and passes
3301  * each chunk to as_iset_default_lpsize().
3302  */
3303 int
3304 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3305 {
3306         struct seg *seg;
3307         caddr_t raddr;
3308         size_t rsize;
3309         size_t ssize;
3310         int rtype, rflags;
3311         int stype, sflags;
3312         int error;
3313         caddr_t setaddr;
3314         size_t setsize;
3315         int segvn;
3316 
3317         if (size == 0)
3318                 return (0);
3319 
3320         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3321 again:
3322         error = 0;
3323 
3324         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3325         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3326             (size_t)raddr;
3327 
3328         if (raddr + rsize < raddr) {         /* check for wraparound */
3329                 AS_LOCK_EXIT(as, &as->a_lock);
3330                 return (ENOMEM);
3331         }
3332         as_clearwatchprot(as, raddr, rsize);
3333         seg = as_segat(as, raddr);
3334         if (seg == NULL) {
3335                 as_setwatch(as);
3336                 AS_LOCK_EXIT(as, &as->a_lock);
3337                 return (ENOMEM);
3338         }
3339         if (seg->s_ops == &segvn_ops) {
3340                 rtype = segop_gettype(seg, addr);
3341                 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3342                 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3343                 segvn = 1;
3344         } else {
3345                 segvn = 0;
3346         }
3347         setaddr = raddr;
3348         setsize = 0;
3349 
3350         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3351                 if (raddr >= (seg->s_base + seg->s_size)) {
3352                         seg = AS_SEGNEXT(as, seg);
3353                         if (seg == NULL || raddr != seg->s_base) {
3354                                 error = ENOMEM;
3355                                 break;
3356                         }
3357                         if (seg->s_ops == &segvn_ops) {
3358                                 stype = segop_gettype(seg, raddr);
3359                                 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3360                                 stype &= (MAP_SHARED | MAP_PRIVATE);
3361                                 if (segvn && (rflags != sflags ||
3362                                     rtype != stype)) {
3363                                         /*
3364                                          * The next segment is also segvn but
3365                                          * has different flags and/or type.
3366                                          */
3367                                         ASSERT(setsize != 0);
3368                                         error = as_iset_default_lpsize(as,
3369                                             setaddr, setsize, rflags, rtype);
3370                                         if (error) {
3371                                                 break;
3372                                         }
3373                                         rflags = sflags;
3374                                         rtype = stype;
3375                                         setaddr = raddr;
3376                                         setsize = 0;
3377                                 } else if (!segvn) {
3378                                         rflags = sflags;
3379                                         rtype = stype;
3380                                         setaddr = raddr;
3381                                         setsize = 0;
3382                                         segvn = 1;
3383                                 }
3384                         } else if (segvn) {
3385                                 /* The next segment is not segvn. */
3386                                 ASSERT(setsize != 0);
3387                                 error = as_iset_default_lpsize(as,
3388                                     setaddr, setsize, rflags, rtype);
3389                                 if (error) {
3390                                         break;
3391                                 }
3392                                 segvn = 0;
3393                         }
3394                 }
3395                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3396                         ssize = seg->s_base + seg->s_size - raddr;
3397                 } else {
3398                         ssize = rsize;
3399                 }
3400         }
3401         if (error == 0 && segvn) {
3402                 /* The last chunk when rsize == 0. */
3403                 ASSERT(setsize != 0);
3404                 error = as_iset_default_lpsize(as, setaddr, setsize,
3405                     rflags, rtype);
3406         }
3407 
3408         if (error == IE_RETRY) {
3409                 goto again;
3410         } else if (error == IE_NOMEM) {
3411                 error = EAGAIN;
3412         } else if (error == ENOTSUP) {
3413                 error = EINVAL;
3414         } else if (error == EAGAIN) {
3415                 mutex_enter(&as->a_contents);
3416                 if (!AS_ISNOUNMAPWAIT(as)) {
3417                         if (AS_ISUNMAPWAIT(as) == 0) {
3418                                 cv_broadcast(&as->a_cv);
3419                         }
3420                         AS_SETUNMAPWAIT(as);
3421                         AS_LOCK_EXIT(as, &as->a_lock);
3422                         while (AS_ISUNMAPWAIT(as)) {
3423                                 cv_wait(&as->a_cv, &as->a_contents);
3424                         }
3425                         mutex_exit(&as->a_contents);
3426                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3427                 } else {
3428                         /*
3429                          * We may have raced with
3430                          * segvn_reclaim()/segspt_reclaim(). In this case
3431                          * clean nounmapwait flag and retry since softlockcnt
3432                          * in this segment may be already 0.  We don't drop as
3433                          * writer lock so our number of retries without
3434                          * sleeping should be very small. See segvn_reclaim()
3435                          * for more comments.
3436                          */
3437                         AS_CLRNOUNMAPWAIT(as);
3438                         mutex_exit(&as->a_contents);
3439                 }
3440                 goto again;
3441         }
3442 
3443         as_setwatch(as);
3444         AS_LOCK_EXIT(as, &as->a_lock);
3445         return (error);
3446 }
3447 
3448 /*
3449  * Set up all of the uninitialized watched pages that we can.
3450  */
3451 void
3452 as_setwatch(struct as *as)
3453 {
3454         struct watched_page *pwp;
3455         struct seg *seg;
3456         caddr_t vaddr;
3457         uint_t prot;
3458         int  err, retrycnt;
3459 
3460         if (avl_numnodes(&as->a_wpage) == 0)
3461                 return;
3462 
3463         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3464 
3465         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3466             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3467                 retrycnt = 0;
3468         retry:
3469                 vaddr = pwp->wp_vaddr;
3470                 if (pwp->wp_oprot != 0 ||    /* already set up */
3471                     (seg = as_segat(as, vaddr)) == NULL ||
3472                     segop_getprot(seg, vaddr, 0, &prot) != 0)
3473                         continue;
3474 
3475                 pwp->wp_oprot = prot;
3476                 if (pwp->wp_read)
3477                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3478                 if (pwp->wp_write)
3479                         prot &= ~PROT_WRITE;
3480                 if (pwp->wp_exec)
3481                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3482                 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3483                         err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3484                         if (err == IE_RETRY) {
3485                                 pwp->wp_oprot = 0;
3486                                 ASSERT(retrycnt == 0);
3487                                 retrycnt++;
3488                                 goto retry;
3489                         }
3490                 }
3491                 pwp->wp_prot = prot;
3492         }
3493 }
3494 
3495 /*
3496  * Clear all of the watched pages in the address space.
3497  */
3498 void
3499 as_clearwatch(struct as *as)
3500 {
3501         struct watched_page *pwp;
3502         struct seg *seg;
3503         caddr_t vaddr;
3504         uint_t prot;
3505         int err, retrycnt;
3506 
3507         if (avl_numnodes(&as->a_wpage) == 0)
3508                 return;
3509 
3510         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3511 
3512         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3513             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3514                 retrycnt = 0;
3515         retry:
3516                 vaddr = pwp->wp_vaddr;
3517                 if (pwp->wp_oprot == 0 ||    /* not set up */
3518                     (seg = as_segat(as, vaddr)) == NULL)
3519                         continue;
3520 
3521                 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3522                         err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3523                         if (err == IE_RETRY) {
3524                                 ASSERT(retrycnt == 0);
3525                                 retrycnt++;
3526                                 goto retry;
3527                         }
3528                 }
3529                 pwp->wp_oprot = 0;
3530                 pwp->wp_prot = 0;
3531         }
3532 }
3533 
3534 /*
3535  * Force a new setup for all the watched pages in the range.
3536  */
3537 static void
3538 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3539 {
3540         struct watched_page *pwp;
3541         struct watched_page tpw;
3542         caddr_t eaddr = addr + size;
3543         caddr_t vaddr;
3544         struct seg *seg;
3545         int err, retrycnt;
3546         uint_t  wprot;
3547         avl_index_t where;
3548 
3549         if (avl_numnodes(&as->a_wpage) == 0)
3550                 return;
3551 
3552         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3553 
3554         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3555         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3556                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3557 
3558         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3559                 retrycnt = 0;
3560                 vaddr = pwp->wp_vaddr;
3561 
3562                 wprot = prot;
3563                 if (pwp->wp_read)
3564                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3565                 if (pwp->wp_write)
3566                         wprot &= ~PROT_WRITE;
3567                 if (pwp->wp_exec)
3568                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3569                 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3570                 retry:
3571                         seg = as_segat(as, vaddr);
3572                         if (seg == NULL) {
3573                                 panic("as_setwatchprot: no seg");
3574                                 /*NOTREACHED*/
3575                         }
3576                         err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3577                         if (err == IE_RETRY) {
3578                                 ASSERT(retrycnt == 0);
3579                                 retrycnt++;
3580                                 goto retry;
3581                         }
3582                 }
3583                 pwp->wp_oprot = prot;
3584                 pwp->wp_prot = wprot;
3585 
3586                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3587         }
3588 }
3589 
3590 /*
3591  * Clear all of the watched pages in the range.
3592  */
3593 static void
3594 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3595 {
3596         caddr_t eaddr = addr + size;
3597         struct watched_page *pwp;
3598         struct watched_page tpw;
3599         uint_t prot;
3600         struct seg *seg;
3601         int err, retrycnt;
3602         avl_index_t where;
3603 
3604         if (avl_numnodes(&as->a_wpage) == 0)
3605                 return;
3606 
3607         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3608         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3609                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3610 
3611         ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3612 
3613         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3614 
3615                 if ((prot = pwp->wp_oprot) != 0) {
3616                         retrycnt = 0;
3617 
3618                         if (prot != pwp->wp_prot) {
3619                         retry:
3620                                 seg = as_segat(as, pwp->wp_vaddr);
3621                                 if (seg == NULL)
3622                                         continue;
3623                                 err = segop_setprot(seg, pwp->wp_vaddr,
3624                                     PAGESIZE, prot);
3625                                 if (err == IE_RETRY) {
3626                                         ASSERT(retrycnt == 0);
3627                                         retrycnt++;
3628                                         goto retry;
3629 
3630                                 }
3631                         }
3632                         pwp->wp_oprot = 0;
3633                         pwp->wp_prot = 0;
3634                 }
3635 
3636                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3637         }
3638 }
3639 
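     /*
      * Post the given siginfo to every process whose address space is 'as'.
      * p_as is rechecked under p_lock before the signal is queued.
      */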
3640 void
3641 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3642 {
3643         struct proc *p;
3644 
3645         mutex_enter(&pidlock);
3646         for (p = practive; p; p = p->p_next) {
3647                 if (p->p_as == as) {
3648                         mutex_enter(&p->p_lock);
3649                         if (p->p_as == as)
3650                                 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3651                         mutex_exit(&p->p_lock);
3652                 }
3653         }
3654         mutex_exit(&pidlock);
3655 }
3656 
3657 /*
3658  * Return the memory object ID for the mapping at the given address.
3659  */
3660 int
3661 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3662 {
3663         struct seg      *seg;
3664         int             sts;
3665 
3666         AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3667         seg = as_segat(as, addr);
3668         if (seg == NULL) {
3669                 AS_LOCK_EXIT(as, &as->a_lock);
3670                 return (EFAULT);
3671         }
3672 
3673         sts = segop_getmemid(seg, addr, memidp);
3674 
3675         AS_LOCK_EXIT(as, &as->a_lock);
3676         return (sts);
3677 }