/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015, Joyent, Inc.  All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/xhat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);


/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list.  A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback *current_head, *cb;
	caddr_t saddr;
	size_t rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */
	if (as == &kas)
		return (ENOMEM);

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	if (size != -1) {
		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
		    (size_t)saddr;
		/* check for wraparound */
		if (saddr + rsize < saddr)
			return (ENOMEM);
	} else {
		if (vaddr != 0)
			return (EINVAL);
		saddr = vaddr;
		rsize = size;
	}

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}
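
/*
 * Illustrative usage sketch (not compiled into the kernel; the driver,
 * its softc and helper function below are hypothetical): a driver that
 * holds pages of a user address space locked long term might register
 * for unmap events so it can release those pages on demand:
 *
 *	static void
 *	xx_as_callback(struct as *as, void *arg, uint_t events)
 *	{
 *		struct xx_softc *sc = arg;	// hypothetical driver state
 *
 *		xx_unlock_user_pages(sc);	// hypothetical helper
 *		(void) as_delete_callback(as, sc);
 *	}
 *
 *	error = as_add_callback(as, xx_as_callback, sc,
 *	    AS_UNMAP_EVENT | AS_UNMAPWAIT_EVENT, uaddr, ulen, KM_SLEEP);
 *
 * The callback must eventually be deleted via as_delete_callback() (here,
 * by the callback itself) so that waiters in as_execute_callback() can
 * make progress.
 */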
/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED		(callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND		(no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED	(callback is in process, delete of this
 *					entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.  In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps, so it is ok to call it with locks other
 * than the (required) a_contents mutex held.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}
/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void *cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue.  as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list.  Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}

/*
 * Check the callback list for a matching event and intersection of
 * address range.  If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not an event of interest
 *    - not an address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning.  Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents.  as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held.  as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}
/*
 * Search for the segment containing addr.  If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	/*
	 * The comparator (as_segcompar) only examines the key's s_base,
	 * the first member of struct seg, so a pointer to a bare address
	 * can stand in for a full segment here.
	 */
	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */
/*
 * Add a new segment to the address space.  The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;
again:

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
#ifdef __sparc
				extern const struct seg_ops segnf_ops;

				/*
				 * no-fault segs must disappear if overlaid.
				 * XXX need new segment type so
				 * we don't have to check s_ops
				 */
				if (seg->s_ops == &segnf_ops) {
					seg_unmap(seg);
					goto again;
				}
#endif
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (NULL);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}
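
/*
 * Typical use of the gap-claim serialization above (illustrative sketch;
 * base, len and crargs are hypothetical locals): hold the claim across
 * the as_gap()/as_map() pair so no other thread can pick the same hole:
 *
 *	as_rangelock(as);
 *	if (as_gap(as, len, &base, &len, AH_LO, NULL) == 0)
 *		error = as_map(as, base, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */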
/*
 * Compare segments (or just an address) by segment address range.
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}


void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}

void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_flags = 0;
	as->a_vbits = 0;
	as->a_hrm = NULL;
	as->a_seglast = NULL;
	as->a_size = 0;
	as->a_resvsize = 0;
	as->a_updatedir = 0;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_sizedir = 0;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as, &as->a_lock);

	as->a_xhat = NULL;

	return (as);
}
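
/*
 * Lifecycle sketch (illustrative; addr, len and crargs are hypothetical):
 * an address space is created with as_alloc() (or duplicated with as_dup()
 * at fork time), populated via as_map(), and torn down with as_free(),
 * which unmaps every remaining segment before returning the struct as to
 * the kmem cache:
 *
 *	as = as_alloc();
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	...
 *	as_free(as);
 */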
/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	int called = 0;

top:
	/*
	 * Invoke ALL callbacks.  as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	/* This will prevent new XHATs from attaching to as */
	if (!called)
		AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	if (!called) {
		called = 1;
		hat_free_start(hat);
		if (as->a_xhat != NULL)
			xhat_free_start_all(as);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = segop_unmap(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as, &as->a_lock);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked.  Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim().  In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small.  See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time.  This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	if (as->a_xhat != NULL)
		xhat_free_end_all(as);
	AS_LOCK_EXIT(as, &as->a_lock);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}
int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t purgesize = 0;
	int error;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);

	/* This will prevent new XHATs from attaching */
	mutex_enter(&as->a_contents);
	AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	mutex_enter(&newas->a_contents);
	AS_SETBUSY(newas);
	mutex_exit(&newas->a_contents);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			as_free(newas);
			return (-1);
		}
		if ((error = segop_dup(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			seg_free(newseg);
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_free(newas);
			return (error);
		}
		newas->a_size += seg->s_size;
	}
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
	if (as->a_xhat != NULL)
		error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);

	mutex_enter(&newas->a_contents);
	AS_CLRBUSY(newas);
	mutex_exit(&newas->a_contents);
	AS_LOCK_EXIT(newas, &newas->a_lock);

	as_setwatch(as);
	mutex_enter(&as->a_contents);
	AS_CLRBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_EXIT(as, &as->a_lock);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}
/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);
	int is_xhat = 0;
	int holding_wpage = 0;

	if (as->a_hat != hat) {
		/* This must be an XHAT then */
		is_xhat = 1;

		if ((type != F_INVAL) || (as == &kas))
			return (FC_NOSUPPORT);
	}

retry:
	if (!is_xhat) {
		/*
		 * Indicate that the lwp is not to be stopped while waiting
		 * for a pagefault.  This is to avoid deadlock while debugging
		 * a process via /proc over NFS (in particular).
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;

		/*
		 * same length must be used when we softlock and softunlock.
		 * We don't support softunlocking lengths less than
		 * the original length when there is largepage support.
		 * See seg_dev.c for more comments.
		 */
		switch (type) {

		case F_SOFTLOCK:
			CPU_STATS_ADD_K(vm, softlock, 1);
			break;

		case F_SOFTUNLOCK:
			break;

		case F_PROT:
			CPU_STATS_ADD_K(vm, prot_fault, 1);
			break;

		case F_INVAL:
			CPU_STATS_ENTER_K();
			CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
			if (as == &kas)
				CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
			CPU_STATS_EXIT_K();
			break;
		}
	}

	/* Kernel probe */
	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr,
	    tnf_fault_type, fault_type, type,
	    tnf_seg_access, access, rw);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap.  We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		/*
		 * if (as==&kas), this can't be XHAT: we've already returned
		 * FC_NOSUPPORT.
		 */
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
			/*
			 * Grab and hold the writers' lock on the as
			 * if the fault is to a watched page.
			 * This will keep CPUs from "peeking" at the
			 * address range while we're temporarily boosting
			 * the permissions for the XHAT device to
			 * resolve the fault in the segment layer.
			 *
			 * We could check whether faulted address
			 * is within a watched page and only then grab
			 * the writer lock, but this is simpler.
			 */
			AS_LOCK_EXIT(as, &as->a_lock);
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		}

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if ((lwp != NULL) && (!is_xhat))
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		if (!is_xhat || (seg->s_ops != &segdev_ops)) {

			if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
			    pr_is_watchpage_as(raddr, rw, as)) {
				/*
				 * Handle watch pages.  If we're faulting on a
				 * watched page from an X-hat, we have to
				 * restore the original permissions while we
				 * handle the fault.
				 */
				as_clearwatch(as);
				holding_wpage = 1;
			}

			res = segop_fault(hat, seg, raddr, ssize, type, rw);

			/* Restore watchpoints */
			if (holding_wpage) {
				as_setwatch(as);
				holding_wpage = 0;
			}

			if (res != 0)
				break;
		} else {
			/* XHAT does not support seg_dev */
			res = FC_NOSUPPORT;
			break;
		}
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did.  (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * right here...)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) segop_fault(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as, &as->a_lock);
	if ((lwp != NULL) && (!is_xhat))
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem.  See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as, &as->a_lock);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = segop_faulta(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem.  See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
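
/*
 * Worked example of the page-rounding idiom used by as_fault(), as_faulta()
 * and the other entry points in this file (illustrative; assumes 8K pages,
 * i.e. PAGESIZE == 0x2000 and PAGEOFFSET == 0x1fff):
 *
 *	addr  = 0x12345, size = 0x100
 *	raddr = addr & PAGEMASK                            = 0x12000
 *	rsize = ((addr + size + PAGEOFFSET) & PAGEMASK) - raddr
 *	      = (0x14444 & ~0x1fff) - 0x12000              = 0x2000
 *
 * i.e. [addr, addr + size) is expanded to the smallest page-aligned range
 * [raddr, raddr + rsize) that contains it.
 */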
/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader.  But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY.  Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list.  Also the segment driver will return IE_RETRY
	 * after it has changed the segment list, so we keep
	 * locking as a writer.  Since these operations should be rare,
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = segop_setprot(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as, &as->a_lock);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed.  Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request.  This is needed to allow this operation
			 *	to succeed.  Each entry on the callback list is
			 *	examined.  If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress).  The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time.  Go to the top and do more
			 *	until zero is returned.  If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim().  In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small.  See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}
/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	else
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = segop_checkprot(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap().  (Again,
		 * we have to do this check here while we have seg.)
		 */
		rsize = 0;
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = segop_unmap(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed.  Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request.  This is needed to allow this operation
			 *	to succeed.  Each entry on the callback list is
			 *	examined.  If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress).  The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time.  Go to the top and do more
			 *	until zero is returned.  If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim().  In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small.  See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (-1);
		}

		as->a_size -= ssize;
		if (rsize)
			as->a_resvsize -= rsize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (0);
}
static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
	if (!do_off) {
		vn_a->offset = 0;
	}

	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(a < eaddr);
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}
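
/*
 * Illustrative walk through the szcvec loops above (hypothetical values;
 * page sizes as on sparc: szc 0/1/2 = 8K/64K/512K): with szcvec = 0x7,
 * addr = 8K and size = 1M, the first loop creates a szc 0 segment
 * [8K, 64K) to reach 64K alignment and a szc 1 segment [64K, 512K) to
 * reach 512K alignment; the second loop then carves the remainder from
 * the largest usable page size down, creating a szc 2 segment [512K, 1M)
 * and a szc 0 tail [1M, 1M + 8K), so addr == eaddr on exit.
 */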
static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct seg *seg;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}
/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szcvec;
	uchar_t type;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}

int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}
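
/*
 * as_map_locked() is entered with the address space write-locked (as_map()
 * above takes the lock itself); note that it drops a_lock on every return
 * path below, so the caller must not unlock it again.
 */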
int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;
	struct proc *p = curproc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as, &as->a_lock);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
		as->a_resvsize += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	return (error);
}


/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			(void) segop_unmap(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as, &as->a_lock);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}
1931 */ 1932 minlen += align; 1933 minlen += 2 * redzone; 1934 redzone = 0; 1935 1936 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1937 if (AS_SEGFIRST(as) == NULL) { 1938 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR, 1939 align, redzone, off)) { 1940 AS_LOCK_EXIT(as, &as->a_lock); 1941 return (0); 1942 } else { 1943 AS_LOCK_EXIT(as, &as->a_lock); 1944 *basep = save_base; 1945 *lenp = save_len; 1946 return (-1); 1947 } 1948 } 1949 1950 retry: 1951 /* 1952 * Set up to iterate over all the inter-segment holes in the given 1953 * direction. lseg is NULL for the lowest-addressed hole and hseg is 1954 * NULL for the highest-addressed hole. If moving backwards, we reset 1955 * hseg to denote the highest-addressed segment. 1956 */ 1957 forward = (flags & AH_DIR) == AH_LO; 1958 if (forward) { 1959 hseg = as_findseg(as, lobound, 1); 1960 lseg = AS_SEGPREV(as, hseg); 1961 } else { 1962 1963 /* 1964 * If allocating at least as much as the last allocation, 1965 * use a_lastgap's base as a better estimate of hibound. 1966 */ 1967 if (as->a_lastgap && 1968 minlen >= as->a_lastgap->s_size && 1969 hibound >= as->a_lastgap->s_base) 1970 hibound = as->a_lastgap->s_base; 1971 1972 hseg = as_findseg(as, hibound, 1); 1973 if (hseg->s_base + hseg->s_size < hibound) { 1974 lseg = hseg; 1975 hseg = NULL; 1976 } else { 1977 lseg = AS_SEGPREV(as, hseg); 1978 } 1979 } 1980 1981 for (;;) { 1982 /* 1983 * Set lo and hi to the hole's boundaries. (We should really 1984 * use MAXADDR in place of hibound in the expression below, 1985 * but can't express it easily; using hibound in its place is 1986 * harmless.) 1987 */ 1988 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size; 1989 hi = (hseg == NULL) ? hibound : hseg->s_base; 1990 /* 1991 * If the iteration has moved past the interval from lobound 1992 * to hibound, it's pointless to continue. 1993 */ 1994 if ((forward && lo > hibound) || (!forward && hi < lobound)) 1995 break; 1996 else if (lo > hibound || hi < lobound) 1997 goto cont; 1998 /* 1999 * Candidate hole lies at least partially within the allowable 2000 * range. Restrict it to fall completely within that range, 2001 * i.e., to [max(lo, lobound), min(hi, hibound)]. 2002 */ 2003 if (lo < lobound) 2004 lo = lobound; 2005 if (hi > hibound) 2006 hi = hibound; 2007 /* 2008 * Verify that the candidate hole is big enough and meets 2009 * hardware constraints. If the hole is too small, no need 2010 * to do the further checks since they will fail. 2011 */ 2012 *basep = lo; 2013 *lenp = hi - lo; 2014 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp, 2015 minlen, forward ? AH_LO : AH_HI, align, redzone, off) && 2016 ((flags & AH_CONTAIN) == 0 || 2017 (*basep <= addr && *basep + *lenp > addr))) { 2018 if (!forward) 2019 as->a_lastgap = hseg; 2020 if (hseg != NULL) 2021 as->a_lastgaphl = hseg; 2022 else 2023 as->a_lastgaphl = lseg; 2024 AS_LOCK_EXIT(as, &as->a_lock); 2025 return (0); 2026 } 2027 cont: 2028 /* 2029 * Move to the next hole.
2030 */ 2031 if (forward) { 2032 lseg = hseg; 2033 if (lseg == NULL) 2034 break; 2035 hseg = AS_SEGNEXT(as, hseg); 2036 } else { 2037 hseg = lseg; 2038 if (hseg == NULL) 2039 break; 2040 lseg = AS_SEGPREV(as, lseg); 2041 } 2042 } 2043 if (fast_path && (align != 0 || save_redzone != 0)) { 2044 fast_path = 0; 2045 minlen = save_minlen; 2046 redzone = save_redzone; 2047 goto retry; 2048 } 2049 *basep = save_base; 2050 *lenp = save_len; 2051 AS_LOCK_EXIT(as, &as->a_lock); 2052 return (-1); 2053 } 2054 2055 /* 2056 * Find a hole of at least size minlen within [*basep, *basep + *lenp). 2057 * 2058 * If flags specifies AH_HI, the hole will have the highest possible address 2059 * in the range. We use the as->a_lastgap field to figure out where to 2060 * start looking for a gap. 2061 * 2062 * Otherwise, the gap will have the lowest possible address. 2063 * 2064 * If flags specifies AH_CONTAIN, the hole will contain the address addr. 2065 * 2066 * If an adequate hole is found, base and len are set to reflect the part of 2067 * the hole that is within range, and 0 is returned, otherwise, 2068 * -1 is returned. 2069 * 2070 * NOTE: This routine is not correct when base+len overflows caddr_t. 2071 */ 2072 int 2073 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, 2074 caddr_t addr) 2075 { 2076 2077 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0)); 2078 } 2079 2080 /* 2081 * Return the next range within [base, base + len) that is backed 2082 * with "real memory". Skip holes and non-seg_vn segments. 2083 * We're lazy and only return one segment at a time. 2084 */ 2085 int 2086 as_memory(struct as *as, caddr_t *basep, size_t *lenp) 2087 { 2088 extern const struct seg_ops segspt_shmops; /* needs a header file */ 2089 struct seg *seg; 2090 caddr_t addr, eaddr; 2091 caddr_t segend; 2092 2093 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2094 2095 addr = *basep; 2096 eaddr = addr + *lenp; 2097 2098 seg = as_findseg(as, addr, 0); 2099 if (seg != NULL) 2100 addr = MAX(seg->s_base, addr); 2101 2102 for (;;) { 2103 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { 2104 AS_LOCK_EXIT(as, &as->a_lock); 2105 return (EINVAL); 2106 } 2107 2108 if (seg->s_ops == &segvn_ops) { 2109 segend = seg->s_base + seg->s_size; 2110 break; 2111 } 2112 2113 /* 2114 * We do ISM by looking into the private data 2115 * to determine the real size of the segment. 2116 */ 2117 if (seg->s_ops == &segspt_shmops) { 2118 segend = seg->s_base + spt_realsize(seg); 2119 if (addr < segend) 2120 break; 2121 } 2122 2123 seg = AS_SEGNEXT(as, seg); 2124 2125 if (seg != NULL) 2126 addr = seg->s_base; 2127 } 2128 2129 *basep = addr; 2130 2131 if (segend > eaddr) 2132 *lenp = eaddr - addr; 2133 else 2134 *lenp = segend - addr; 2135 2136 AS_LOCK_EXIT(as, &as->a_lock); 2137 return (0); 2138 } 2139 2140 /* 2141 * Swap the pages associated with the address space as out to 2142 * secondary storage, returning the number of bytes actually 2143 * swapped. 2144 * 2145 * The value returned is intended to correlate well with the process's 2146 * memory requirements. Its usefulness for this purpose depends on 2147 * how well the segment-level routines do at returning accurate 2148 * information. 2149 */ 2150 size_t 2151 as_swapout(struct as *as) 2152 { 2153 struct seg *seg; 2154 size_t swpcnt = 0; 2155 2156 /* 2157 * Kernel-only processes have given up their address 2158 * spaces. Of course, we shouldn't be attempting to 2159 * swap out such processes in the first place... 
2160 */ 2161 if (as == NULL) 2162 return (0); 2163 2164 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2165 2166 /* Prevent XHATs from attaching */ 2167 mutex_enter(&as->a_contents); 2168 AS_SETBUSY(as); 2169 mutex_exit(&as->a_contents); 2170 2171 2172 /* 2173 * Free all mapping resources associated with the address 2174 * space. The segment-level swapout routines capitalize 2175 * on this unmapping by scavenging pages that have become 2176 * unmapped here. 2177 */ 2178 hat_swapout(as->a_hat); 2179 if (as->a_xhat != NULL) 2180 xhat_swapout_all(as); 2181 2182 mutex_enter(&as->a_contents); 2183 AS_CLRBUSY(as); 2184 mutex_exit(&as->a_contents); 2185 2186 /* 2187 * Call the swapout routines of all segments in the address 2188 * space to do the actual work, accumulating the amount of 2189 * space reclaimed. 2190 */ 2191 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 2192 const struct seg_ops *ov = seg->s_ops; 2193 2194 /* 2195 * We have to check to see if the seg has 2196 * an ops vector because the seg may have 2197 * been in the middle of being set up when 2198 * the process was picked for swapout. 2199 */ 2200 if ((ov != NULL) && (ov->swapout != NULL)) 2201 swpcnt += segop_swapout(seg); 2202 } 2203 AS_LOCK_EXIT(as, &as->a_lock); 2204 return (swpcnt); 2205 } 2206 2207 /* 2208 * Determine whether data from the mappings in interval [addr, addr + size) 2209 * are in the primary memory (core) cache. 2210 */ 2211 int 2212 as_incore(struct as *as, caddr_t addr, 2213 size_t size, char *vec, size_t *sizep) 2214 { 2215 struct seg *seg; 2216 size_t ssize; 2217 caddr_t raddr; /* rounded down addr */ 2218 size_t rsize; /* rounded up size */ 2219 size_t isize; /* iteration size */ 2220 int error = 0; /* result, assume success */ 2221 2222 *sizep = 0; 2223 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2224 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - 2225 (size_t)raddr; 2226 2227 if (raddr + rsize < raddr) /* check for wraparound */ 2228 return (ENOMEM); 2229 2230 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2231 seg = as_segat(as, raddr); 2232 if (seg == NULL) { 2233 AS_LOCK_EXIT(as, &as->a_lock); 2234 return (-1); 2235 } 2236 2237 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2238 if (raddr >= seg->s_base + seg->s_size) { 2239 seg = AS_SEGNEXT(as, seg); 2240 if (seg == NULL || raddr != seg->s_base) { 2241 error = -1; 2242 break; 2243 } 2244 } 2245 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2246 ssize = seg->s_base + seg->s_size - raddr; 2247 else 2248 ssize = rsize; 2249 *sizep += isize = segop_incore(seg, raddr, ssize, vec); 2250 if (isize != ssize) { 2251 error = -1; 2252 break; 2253 } 2254 vec += btopr(ssize); 2255 } 2256 AS_LOCK_EXIT(as, &as->a_lock); 2257 return (error); 2258 } 2259 2260 static void 2261 as_segunlock(struct seg *seg, caddr_t addr, int attr, 2262 ulong_t *bitmap, size_t position, size_t npages) 2263 { 2264 caddr_t range_start; 2265 size_t pos1 = position; 2266 size_t pos2; 2267 size_t size; 2268 size_t end_pos = npages + position; 2269 2270 while (bt_range(bitmap, &pos1, &pos2, end_pos)) { 2271 size = ptob((pos2 - pos1)); 2272 range_start = (caddr_t)((uintptr_t)addr + 2273 ptob(pos1 - position)); 2274 2275 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK, 2276 (ulong_t *)NULL, (size_t)NULL); 2277 pos1 = pos2; 2278 } 2279 } 2280 2281 static void 2282 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, 2283 caddr_t raddr, size_t rsize) 2284 { 2285 struct seg *seg = as_segat(as, raddr); 2286 size_t ssize; 2287
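/* Walk the segments covering [raddr, raddr + rsize) and use as_segunlock() to undo the MC_LOCK operations recorded in mlock_map, so a partially completed lock request leaves no pages locked behind. */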
2288 while (rsize != 0) { 2289 if (raddr >= seg->s_base + seg->s_size) 2290 seg = AS_SEGNEXT(as, seg); 2291 2292 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2293 ssize = seg->s_base + seg->s_size - raddr; 2294 else 2295 ssize = rsize; 2296 2297 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2298 2299 rsize -= ssize; 2300 raddr += ssize; 2301 } 2302 } 2303 2304 /* 2305 * Cache control operations over the interval [addr, addr + size) in 2306 * address space "as". 2307 */ 2308 /*ARGSUSED*/ 2309 int 2310 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2311 uintptr_t arg, ulong_t *lock_map, size_t pos) 2312 { 2313 struct seg *seg; /* working segment */ 2314 caddr_t raddr; /* rounded down addr */ 2315 caddr_t initraddr; /* saved initial rounded down addr */ 2316 size_t rsize; /* rounded up size */ 2317 size_t initrsize; /* saved initial rounded up size */ 2318 size_t ssize; /* size of seg */ 2319 int error = 0; /* result */ 2320 size_t mlock_size; /* size of bitmap */ 2321 ulong_t *mlock_map; /* pointer to bitmap used */ 2322 /* to represent the locked */ 2323 /* pages. */ 2324 retry: 2325 if (error == IE_RETRY) 2326 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2327 else 2328 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2329 2330 /* 2331 * If these are address space lock/unlock operations, loop over 2332 * all segments in the address space, as appropriate. 2333 */ 2334 if (func == MC_LOCKAS) { 2335 size_t npages, idx; 2336 size_t rlen = 0; /* rounded as length */ 2337 2338 idx = pos; 2339 2340 if (arg & MCL_FUTURE) { 2341 mutex_enter(&as->a_contents); 2342 AS_SETPGLCK(as); 2343 mutex_exit(&as->a_contents); 2344 } 2345 if ((arg & MCL_CURRENT) == 0) { 2346 AS_LOCK_EXIT(as, &as->a_lock); 2347 return (0); 2348 } 2349 2350 seg = AS_SEGFIRST(as); 2351 if (seg == NULL) { 2352 AS_LOCK_EXIT(as, &as->a_lock); 2353 return (0); 2354 } 2355 2356 do { 2357 raddr = (caddr_t)((uintptr_t)seg->s_base & 2358 (uintptr_t)PAGEMASK); 2359 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2360 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2361 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2362 2363 mlock_size = BT_BITOUL(btopr(rlen)); 2364 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2365 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2366 AS_LOCK_EXIT(as, &as->a_lock); 2367 return (EAGAIN); 2368 } 2369 2370 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2371 error = segop_lockop(seg, seg->s_base, 2372 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2373 if (error != 0) 2374 break; 2375 pos += seg_pages(seg); 2376 } 2377 2378 if (error) { 2379 for (seg = AS_SEGFIRST(as); seg != NULL; 2380 seg = AS_SEGNEXT(as, seg)) { 2381 2382 raddr = (caddr_t)((uintptr_t)seg->s_base & 2383 (uintptr_t)PAGEMASK); 2384 npages = seg_pages(seg); 2385 as_segunlock(seg, raddr, attr, mlock_map, 2386 idx, npages); 2387 idx += npages; 2388 } 2389 } 2390 2391 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2392 AS_LOCK_EXIT(as, &as->a_lock); 2393 goto lockerr; 2394 } else if (func == MC_UNLOCKAS) { 2395 mutex_enter(&as->a_contents); 2396 AS_CLRPGLCK(as); 2397 mutex_exit(&as->a_contents); 2398 2399 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2400 error = segop_lockop(seg, seg->s_base, 2401 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2402 if (error != 0) 2403 break; 2404 } 2405 2406 AS_LOCK_EXIT(as, &as->a_lock); 2407 goto lockerr; 2408 } 2409 2410 /* 2411 * Normalize addresses and sizes. 
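* (Round raddr down and the end of the range up to page boundaries; the initial values are saved so the MC_LOCK failure path can unwind exactly what was processed.)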
2412 */ 2413 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2414 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2415 (size_t)raddr; 2416 2417 if (raddr + rsize < raddr) { /* check for wraparound */ 2418 AS_LOCK_EXIT(as, &as->a_lock); 2419 return (ENOMEM); 2420 } 2421 2422 /* 2423 * Get initial segment. 2424 */ 2425 if ((seg = as_segat(as, raddr)) == NULL) { 2426 AS_LOCK_EXIT(as, &as->a_lock); 2427 return (ENOMEM); 2428 } 2429 2430 if (func == MC_LOCK) { 2431 mlock_size = BT_BITOUL(btopr(rsize)); 2432 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2433 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2434 AS_LOCK_EXIT(as, &as->a_lock); 2435 return (EAGAIN); 2436 } 2437 } 2438 2439 /* 2440 * Loop over all segments. If a hole in the address range is 2441 * discovered, then fail. For each segment, perform the appropriate 2442 * control operation. 2443 */ 2444 while (rsize != 0) { 2445 2446 /* 2447 * Make sure there's no hole, calculate the portion 2448 * of the next segment to be operated over. 2449 */ 2450 if (raddr >= seg->s_base + seg->s_size) { 2451 seg = AS_SEGNEXT(as, seg); 2452 if (seg == NULL || raddr != seg->s_base) { 2453 if (func == MC_LOCK) { 2454 as_unlockerr(as, attr, mlock_map, 2455 initraddr, initrsize - rsize); 2456 kmem_free(mlock_map, 2457 mlock_size * sizeof (ulong_t)); 2458 } 2459 AS_LOCK_EXIT(as, &as->a_lock); 2460 return (ENOMEM); 2461 } 2462 } 2463 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2464 ssize = seg->s_base + seg->s_size - raddr; 2465 else 2466 ssize = rsize; 2467 2468 /* 2469 * Dispatch on specific function. 2470 */ 2471 switch (func) { 2472 2473 /* 2474 * Synchronize cached data from mappings with backing 2475 * objects. 2476 */ 2477 case MC_SYNC: 2478 if (error = segop_sync(seg, raddr, ssize, 2479 attr, (uint_t)arg)) { 2480 AS_LOCK_EXIT(as, &as->a_lock); 2481 return (error); 2482 } 2483 break; 2484 2485 /* 2486 * Lock pages in memory. 2487 */ 2488 case MC_LOCK: 2489 if (error = segop_lockop(seg, raddr, ssize, 2490 attr, func, mlock_map, pos)) { 2491 as_unlockerr(as, attr, mlock_map, initraddr, 2492 initrsize - rsize + ssize); 2493 kmem_free(mlock_map, mlock_size * 2494 sizeof (ulong_t)); 2495 AS_LOCK_EXIT(as, &as->a_lock); 2496 goto lockerr; 2497 } 2498 break; 2499 2500 /* 2501 * Unlock mapped pages. 2502 */ 2503 case MC_UNLOCK: 2504 (void) segop_lockop(seg, raddr, ssize, attr, func, 2505 (ulong_t *)NULL, (size_t)NULL); 2506 break; 2507 2508 /* 2509 * Store VM advise for mapped pages in segment layer. 
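* Some advice causes the segment layer to split or concatenate segments, which is why the IE_RETRY (redo with the writer lock) and IE_REATTACH (re-find the current segment) cases below exist.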
2510 */ 2511 case MC_ADVISE: 2512 error = segop_advise(seg, raddr, ssize, (uint_t)arg); 2513 2514 /* 2515 * Check for regular errors and special retry error 2516 */ 2517 if (error) { 2518 if (error == IE_RETRY) { 2519 /* 2520 * Need to acquire writers lock, so 2521 * have to drop readers lock and start 2522 * all over again 2523 */ 2524 AS_LOCK_EXIT(as, &as->a_lock); 2525 goto retry; 2526 } else if (error == IE_REATTACH) { 2527 /* 2528 * Find segment for current address 2529 * because current segment just got 2530 * split or concatenated 2531 */ 2532 seg = as_segat(as, raddr); 2533 if (seg == NULL) { 2534 AS_LOCK_EXIT(as, &as->a_lock); 2535 return (ENOMEM); 2536 } 2537 } else { 2538 /* 2539 * Regular error 2540 */ 2541 AS_LOCK_EXIT(as, &as->a_lock); 2542 return (error); 2543 } 2544 } 2545 break; 2546 2547 case MC_INHERIT_ZERO: 2548 error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO); 2549 if (error != 0) { 2550 AS_LOCK_EXIT(as, &as->a_lock); 2551 return (error); 2552 } 2553 break; 2554 2555 /* 2556 * Can't happen. 2557 */ 2558 default: 2559 panic("as_ctl: bad operation %d", func); 2560 /*NOTREACHED*/ 2561 } 2562 2563 rsize -= ssize; 2564 raddr += ssize; 2565 } 2566 2567 if (func == MC_LOCK) 2568 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2569 AS_LOCK_EXIT(as, &as->a_lock); 2570 return (0); 2571 lockerr: 2572 2573 /* 2574 * If the lower levels returned EDEADLK for a segment lockop, 2575 * it means that we should retry the operation. Let's wait 2576 * a bit also to let the deadlock causing condition clear. 2577 * This is part of a gross hack to work around a design flaw 2578 * in the ufs/sds logging code and should go away when the 2579 * logging code is re-designed to fix the problem. See bug 2580 * 4125102 for details of the problem. 2581 */ 2582 if (error == EDEADLK) { 2583 delay(deadlk_wait); 2584 error = 0; 2585 goto retry; 2586 } 2587 return (error); 2588 } 2589 2590 int 2591 fc_decode(faultcode_t fault_err) 2592 { 2593 int error = 0; 2594 2595 switch (FC_CODE(fault_err)) { 2596 case FC_OBJERR: 2597 error = FC_ERRNO(fault_err); 2598 break; 2599 case FC_PROT: 2600 error = EACCES; 2601 break; 2602 default: 2603 error = EFAULT; 2604 break; 2605 } 2606 return (error); 2607 } 2608 2609 /* 2610 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow 2611 * lists from each segment and copy them to one contiguous shadow list (plist) 2612 * as expected by the caller. Save pointers to per segment shadow lists at 2613 * the tail of plist so that they can be used during as_pageunlock(). 2614 */ 2615 static int 2616 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, 2617 caddr_t addr, size_t size, enum seg_rw rw) 2618 { 2619 caddr_t sv_addr = addr; 2620 size_t sv_size = size; 2621 struct seg *sv_seg = seg; 2622 ulong_t segcnt = 1; 2623 ulong_t cnt; 2624 size_t ssize; 2625 pgcnt_t npages = btop(size); 2626 page_t **plist; 2627 page_t **pl; 2628 int error; 2629 caddr_t eaddr; 2630 faultcode_t fault_err = 0; 2631 pgcnt_t pl_off; 2632 extern const struct seg_ops segspt_shmops; 2633 2634 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2635 ASSERT(seg != NULL); 2636 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2637 ASSERT(addr + size > seg->s_base + seg->s_size); 2638 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2639 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2640 2641 /* 2642 * Count the number of segments covered by the range we are about to 2643 * lock. The segment count is used to size the shadow list we return 2644 * back to the caller. 
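* The resulting plist layout is btop(size) page pointers followed by one per-segment shadow-list pointer; as_pageunlock_segs() later reads those tail entries to undo each segment's pagelock.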
2645 */ 2646 for (; size != 0; size -= ssize, addr += ssize) { 2647 if (addr >= seg->s_base + seg->s_size) { 2648 2649 seg = AS_SEGNEXT(as, seg); 2650 if (seg == NULL || addr != seg->s_base) { 2651 AS_LOCK_EXIT(as, &as->a_lock); 2652 return (EFAULT); 2653 } 2654 /* 2655 * Do a quick check if subsequent segments 2656 * will most likely support pagelock. 2657 */ 2658 if (seg->s_ops == &segvn_ops) { 2659 vnode_t *vp; 2660 2661 if (segop_getvp(seg, addr, &vp) != 0 || 2662 vp != NULL) { 2663 AS_LOCK_EXIT(as, &as->a_lock); 2664 goto slow; 2665 } 2666 } else if (seg->s_ops != &segspt_shmops) { 2667 AS_LOCK_EXIT(as, &as->a_lock); 2668 goto slow; 2669 } 2670 segcnt++; 2671 } 2672 if (addr + size > seg->s_base + seg->s_size) { 2673 ssize = seg->s_base + seg->s_size - addr; 2674 } else { 2675 ssize = size; 2676 } 2677 } 2678 ASSERT(segcnt > 1); 2679 2680 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); 2681 2682 addr = sv_addr; 2683 size = sv_size; 2684 seg = sv_seg; 2685 2686 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { 2687 if (addr >= seg->s_base + seg->s_size) { 2688 seg = AS_SEGNEXT(as, seg); 2689 ASSERT(seg != NULL && addr == seg->s_base); 2690 cnt++; 2691 ASSERT(cnt < segcnt); 2692 } 2693 if (addr + size > seg->s_base + seg->s_size) { 2694 ssize = seg->s_base + seg->s_size - addr; 2695 } else { 2696 ssize = size; 2697 } 2698 pl = &plist[npages + cnt]; 2699 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl, 2700 L_PAGELOCK, rw); 2701 if (error) { 2702 break; 2703 } 2704 ASSERT(plist[npages + cnt] != NULL); 2705 ASSERT(pl_off + btop(ssize) <= npages); 2706 bcopy(plist[npages + cnt], &plist[pl_off], 2707 btop(ssize) * sizeof (page_t *)); 2708 pl_off += btop(ssize); 2709 } 2710 2711 if (size == 0) { 2712 AS_LOCK_EXIT(as, &as->a_lock); 2713 ASSERT(cnt == segcnt - 1); 2714 *ppp = plist; 2715 return (0); 2716 } 2717 2718 /* 2719 * One of the pagelock calls failed. The error type is in the error 2720 * variable. Unlock what we've locked so far and retry with F_SOFTLOCK 2721 * if the error type is either EFAULT or ENOTSUP. Otherwise just return 2722 * the error back to the caller. 2723 */ 2724 2725 eaddr = addr; 2726 seg = sv_seg; 2727 2728 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { 2729 if (addr >= seg->s_base + seg->s_size) { 2730 seg = AS_SEGNEXT(as, seg); 2731 ASSERT(seg != NULL && addr == seg->s_base); 2732 cnt++; 2733 ASSERT(cnt < segcnt); 2734 } 2735 if (eaddr > seg->s_base + seg->s_size) { 2736 ssize = seg->s_base + seg->s_size - addr; 2737 } else { 2738 ssize = eaddr - addr; 2739 } 2740 pl = &plist[npages + cnt]; 2741 ASSERT(*pl != NULL); 2742 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl, 2743 L_PAGEUNLOCK, rw); 2744 } 2745 2746 AS_LOCK_EXIT(as, &as->a_lock); 2747 2748 kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); 2749 2750 if (error != ENOTSUP && error != EFAULT) { 2751 return (error); 2752 } 2753 2754 slow: 2755 /* 2756 * If we are here because pagelock failed due to the need to cow fault 2757 * in the pages we want to lock, F_SOFTLOCK will do this job, and in the 2758 * next as_pagelock() call for this address range pagelock will 2759 * hopefully succeed. 2760 */ 2761 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); 2762 if (fault_err != 0) { 2763 return (fc_decode(fault_err)); 2764 } 2765 *ppp = NULL; 2766 2767 return (0); 2768 } 2769 2770 /* 2771 * lock pages in a given address space. Return shadow list. If 2772 * the list is NULL, the MMU mapping is also locked.
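*
 * A minimal sketch of the usual pairing, with a hypothetical caller
 * (uaddr and len are illustrative names, not from this file):
 *
 *	page_t **pplist;
 *	if (as_pagelock(as, &pplist, uaddr, len, S_WRITE) == 0) {
 *		... perform the I/O against [uaddr, uaddr + len) ...
 *		as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *	}
 *
 * A NULL shadow list is a valid success result (the fault path was
 * used); as_pageunlock() handles that case with F_SOFTUNLOCK.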
2773 */ 2774 int 2775 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, 2776 size_t size, enum seg_rw rw) 2777 { 2778 size_t rsize; 2779 caddr_t raddr; 2780 faultcode_t fault_err; 2781 struct seg *seg; 2782 int err; 2783 2784 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, 2785 "as_pagelock_start: addr %p size %ld", addr, size); 2786 2787 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2788 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2789 (size_t)raddr; 2790 2791 /* 2792 * if the request spans more than one segment, let 2793 * as_pagelock_segs() handle it. 2794 */ 2795 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2796 2797 seg = as_segat(as, raddr); 2798 if (seg == NULL) { 2799 AS_LOCK_EXIT(as, &as->a_lock); 2800 return (EFAULT); 2801 } 2802 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2803 if (raddr + rsize > seg->s_base + seg->s_size) { 2804 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw)); 2805 } 2806 if (raddr + rsize <= raddr) { 2807 AS_LOCK_EXIT(as, &as->a_lock); 2808 return (EFAULT); 2809 } 2810 2811 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, 2812 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); 2813 2814 /* 2815 * try to lock pages and pass back shadow list 2816 */ 2817 err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw); 2818 2819 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); 2820 2821 AS_LOCK_EXIT(as, &as->a_lock); 2822 2823 if (err == 0 || (err != ENOTSUP && err != EFAULT)) { 2824 return (err); 2825 } 2826 2827 /* 2828 * Use F_SOFTLOCK to lock the pages because pagelock failed either due 2829 * to no pagelock support for this segment or pages need to be cow 2830 * faulted in. If a fault is needed, F_SOFTLOCK will do this job for 2831 * this as_pagelock() call, and in the next as_pagelock() call for the 2832 * same address range the pagelock call will hopefully succeed. 2833 */ 2834 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); 2835 if (fault_err != 0) { 2836 return (fc_decode(fault_err)); 2837 } 2838 *ppp = NULL; 2839 2840 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); 2841 return (0); 2842 } 2843 2844 /* 2845 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow 2846 * lists from the end of plist and call pageunlock interface for each segment. 2847 * Drop as lock and free plist.
2848 */ 2849 static void 2850 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, 2851 struct page **plist, enum seg_rw rw) 2852 { 2853 ulong_t cnt; 2854 caddr_t eaddr = addr + size; 2855 pgcnt_t npages = btop(size); 2856 size_t ssize; 2857 page_t **pl; 2858 2859 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2860 ASSERT(seg != NULL); 2861 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2862 ASSERT(addr + size > seg->s_base + seg->s_size); 2863 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2864 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2865 ASSERT(plist != NULL); 2866 2867 for (cnt = 0; addr < eaddr; addr += ssize) { 2868 if (addr >= seg->s_base + seg->s_size) { 2869 seg = AS_SEGNEXT(as, seg); 2870 ASSERT(seg != NULL && addr == seg->s_base); 2871 cnt++; 2872 } 2873 if (eaddr > seg->s_base + seg->s_size) { 2874 ssize = seg->s_base + seg->s_size - addr; 2875 } else { 2876 ssize = eaddr - addr; 2877 } 2878 pl = &plist[npages + cnt]; 2879 ASSERT(*pl != NULL); 2880 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl, 2881 L_PAGEUNLOCK, rw); 2882 } 2883 ASSERT(cnt > 0); 2884 AS_LOCK_EXIT(as, &as->a_lock); 2885 2886 cnt++; 2887 kmem_free(plist, (npages + cnt) * sizeof (page_t *)); 2888 } 2889 2890 /* 2891 * unlock pages in a given address range 2892 */ 2893 void 2894 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2895 enum seg_rw rw) 2896 { 2897 struct seg *seg; 2898 size_t rsize; 2899 caddr_t raddr; 2900 2901 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2902 "as_pageunlock_start: addr %p size %ld", addr, size); 2903 2904 /* 2905 * if the shadow list is NULL, as_pagelock was 2906 * falling back to as_fault 2907 */ 2908 if (pp == NULL) { 2909 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2910 return; 2911 } 2912 2913 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2914 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2915 (size_t)raddr; 2916 2917 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2918 seg = as_segat(as, raddr); 2919 ASSERT(seg != NULL); 2920 2921 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2922 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2923 2924 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2925 if (raddr + rsize <= seg->s_base + seg->s_size) { 2926 (void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2927 } else { 2928 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); 2929 return; 2930 } 2931 AS_LOCK_EXIT(as, &as->a_lock); 2932 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2933 } 2934 2935 int 2936 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2937 boolean_t wait) 2938 { 2939 struct seg *seg; 2940 size_t ssize; 2941 caddr_t raddr; /* rounded down addr */ 2942 size_t rsize; /* rounded up size */ 2943 int error = 0; 2944 size_t pgsz = page_get_pagesize(szc); 2945 2946 setpgsz_top: 2947 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2948 return (EINVAL); 2949 } 2950 2951 raddr = addr; 2952 rsize = size; 2953 2954 if (raddr + rsize < raddr) /* check for wraparound */ 2955 return (ENOMEM); 2956 2957 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2958 as_clearwatchprot(as, raddr, rsize); 2959 seg = as_segat(as, raddr); 2960 if (seg == NULL) { 2961 as_setwatch(as); 2962 AS_LOCK_EXIT(as, &as->a_lock); 2963 return (ENOMEM); 2964 } 2965 2966 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2967 if (raddr >= seg->s_base + seg->s_size) { 2968 seg = AS_SEGNEXT(as, seg); 2969 if (seg == NULL || 
raddr != seg->s_base) { 2970 error = ENOMEM; 2971 break; 2972 } 2973 } 2974 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2975 ssize = seg->s_base + seg->s_size - raddr; 2976 } else { 2977 ssize = rsize; 2978 } 2979 2980 retry: 2981 error = segop_setpagesize(seg, raddr, ssize, szc); 2982 2983 if (error == IE_NOMEM) { 2984 error = EAGAIN; 2985 break; 2986 } 2987 2988 if (error == IE_RETRY) { 2989 AS_LOCK_EXIT(as, &as->a_lock); 2990 goto setpgsz_top; 2991 } 2992 2993 if (error == ENOTSUP) { 2994 error = EINVAL; 2995 break; 2996 } 2997 2998 if (wait && (error == EAGAIN)) { 2999 /* 3000 * Memory is currently locked. It must be unlocked 3001 * before this operation can succeed through a retry. 3002 * The possible reasons for locked memory and 3003 * corresponding strategies for unlocking are: 3004 * (1) Normal I/O 3005 * wait for a signal that the I/O operation 3006 * has completed and the memory is unlocked. 3007 * (2) Asynchronous I/O 3008 * The aio subsystem does not unlock pages when 3009 * the I/O is completed. Those pages are unlocked 3010 * when the application calls aiowait/aioerror. 3011 * So, to prevent blocking forever, cv_broadcast() 3012 * is done to wake up aio_cleanup_thread. 3013 * Subsequently, segvn_reclaim will be called, and 3014 * that will do AS_CLRUNMAPWAIT() and wake us up. 3015 * (3) Long term page locking: 3016 * This is not relevant for as_setpagesize() 3017 * because we cannot change the page size for 3018 * driver memory. The attempt to do so will 3019 * fail with a different error than EAGAIN so 3020 * there's no need to trigger as callbacks like 3021 * as_unmap, as_setprot or as_free would do. 3022 */ 3023 mutex_enter(&as->a_contents); 3024 if (!AS_ISNOUNMAPWAIT(as)) { 3025 if (AS_ISUNMAPWAIT(as) == 0) { 3026 cv_broadcast(&as->a_cv); 3027 } 3028 AS_SETUNMAPWAIT(as); 3029 AS_LOCK_EXIT(as, &as->a_lock); 3030 while (AS_ISUNMAPWAIT(as)) { 3031 cv_wait(&as->a_cv, &as->a_contents); 3032 } 3033 } else { 3034 /* 3035 * We may have raced with 3036 * segvn_reclaim()/segspt_reclaim(). In this 3037 * case clean nounmapwait flag and retry since 3038 * softlockcnt in this segment may be already 3039 * 0. We don't drop as writer lock so our 3040 * number of retries without sleeping should 3041 * be very small. See segvn_reclaim() for 3042 * more comments. 3043 */ 3044 AS_CLRNOUNMAPWAIT(as); 3045 mutex_exit(&as->a_contents); 3046 goto retry; 3047 } 3048 mutex_exit(&as->a_contents); 3049 goto setpgsz_top; 3050 } else if (error != 0) { 3051 break; 3052 } 3053 } 3054 as_setwatch(as); 3055 AS_LOCK_EXIT(as, &as->a_lock); 3056 return (error); 3057 } 3058 3059 /* 3060 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments 3061 * in its chunk where s_szc is less than the szc we want to set. 
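* It is the final stage of the default-lpsize pipeline: as_set_default_lpsize() splits the range by segment type/flags, as_iset_default_lpsize() by the szcvec bitmap, as_iset1_default_lpsize() by existing s_szc, and as_iset2_default_lpsize() supplies the EINVAL retry loop around this routine.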
3062 */ 3063 static int 3064 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3065 int *retry) 3066 { 3067 struct seg *seg; 3068 size_t ssize; 3069 int error; 3070 3071 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3072 3073 seg = as_segat(as, raddr); 3074 if (seg == NULL) { 3075 panic("as_iset3_default_lpsize: no seg"); 3076 } 3077 3078 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 3079 if (raddr >= seg->s_base + seg->s_size) { 3080 seg = AS_SEGNEXT(as, seg); 3081 if (seg == NULL || raddr != seg->s_base) { 3082 panic("as_iset3_default_lpsize: as changed"); 3083 } 3084 } 3085 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3086 ssize = seg->s_base + seg->s_size - raddr; 3087 } else { 3088 ssize = rsize; 3089 } 3090 3091 if (szc > seg->s_szc) { 3092 error = segop_setpagesize(seg, raddr, ssize, szc); 3093 /* Only retry on EINVAL segments that have no vnode. */ 3094 if (error == EINVAL) { 3095 vnode_t *vp = NULL; 3096 if ((segop_gettype(seg, raddr) & MAP_SHARED) && 3097 (segop_getvp(seg, raddr, &vp) != 0 || 3098 vp == NULL)) { 3099 *retry = 1; 3100 } else { 3101 *retry = 0; 3102 } 3103 } 3104 if (error) { 3105 return (error); 3106 } 3107 } 3108 } 3109 return (0); 3110 } 3111 3112 /* 3113 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 3114 * pagesize on each segment in its range, but if any fails with EINVAL, 3115 * then it reduces the pagesizes to the next size in the bitmap and 3116 * retries as_iset3_default_lpsize(). The code retries smaller allowed 3117 * sizes on EINVAL because (a) the anon offset may not 3118 * match the bigger sizes, and (b) it's hard to get this offset (to begin 3119 * with) to pass to map_pgszcvec(). 3120 */ 3121 static int 3122 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 3123 uint_t szcvec) 3124 { 3125 int error; 3126 int retry; 3127 3128 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3129 3130 for (;;) { 3131 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 3132 if (error == EINVAL && retry) { 3133 szcvec &= ~(1 << szc); 3134 if (szcvec <= 1) { 3135 return (EINVAL); 3136 } 3137 szc = highbit(szcvec) - 1; 3138 } else { 3139 return (error); 3140 } 3141 } 3142 } 3143 3144 /* 3145 * as_iset1_default_lpsize() breaks its chunk into areas where existing 3146 * segments have a smaller szc than we want to set.
For each such area, 3147 * it calls as_iset2_default_lpsize() 3148 */ 3149 static int 3150 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3151 uint_t szcvec) 3152 { 3153 struct seg *seg; 3154 size_t ssize; 3155 caddr_t setaddr = raddr; 3156 size_t setsize = 0; 3157 int set; 3158 int error; 3159 3160 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3161 3162 seg = as_segat(as, raddr); 3163 if (seg == NULL) { 3164 panic("as_iset1_default_lpsize: no seg"); 3165 } 3166 if (seg->s_szc < szc) { 3167 set = 1; 3168 } else { 3169 set = 0; 3170 } 3171 3172 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3173 if (raddr >= seg->s_base + seg->s_size) { 3174 seg = AS_SEGNEXT(as, seg); 3175 if (seg == NULL || raddr != seg->s_base) { 3176 panic("as_iset1_default_lpsize: as changed"); 3177 } 3178 if (seg->s_szc >= szc && set) { 3179 ASSERT(setsize != 0); 3180 error = as_iset2_default_lpsize(as, 3181 setaddr, setsize, szc, szcvec); 3182 if (error) { 3183 return (error); 3184 } 3185 set = 0; 3186 } else if (seg->s_szc < szc && !set) { 3187 setaddr = raddr; 3188 setsize = 0; 3189 set = 1; 3190 } 3191 } 3192 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3193 ssize = seg->s_base + seg->s_size - raddr; 3194 } else { 3195 ssize = rsize; 3196 } 3197 } 3198 error = 0; 3199 if (set) { 3200 ASSERT(setsize != 0); 3201 error = as_iset2_default_lpsize(as, setaddr, setsize, 3202 szc, szcvec); 3203 } 3204 return (error); 3205 } 3206 3207 /* 3208 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 3209 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 3210 * chunk to as_iset1_default_lpsize(). 3211 */ 3212 static int 3213 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 3214 int type) 3215 { 3216 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM; 3217 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 3218 flags, rtype, 1); 3219 uint_t szc; 3220 uint_t nszc; 3221 int error; 3222 caddr_t a; 3223 caddr_t eaddr; 3224 size_t segsize; 3225 size_t pgsz; 3226 uint_t save_szcvec; 3227 3228 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3229 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 3230 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 3231 3232 szcvec &= ~1; 3233 if (szcvec <= 1) { /* skip if base page size */ 3234 return (0); 3235 } 3236 3237 /* Get the pagesize of the first larger page size. 
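For instance, if szcvec has bits 1 and 2 set after the base-page bit is cleared, the loops below carve the trimmed range into a leading szc-1 piece up to the first szc-2 boundary, an szc-2 middle, and an szc-1 tail, mirroring the carving done by as_map_segvn_segs().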
*/ 3238 szc = lowbit(szcvec) - 1; 3239 pgsz = page_get_pagesize(szc); 3240 eaddr = addr + size; 3241 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3242 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3243 3244 save_szcvec = szcvec; 3245 szcvec >>= (szc + 1); 3246 nszc = szc; 3247 while (szcvec) { 3248 if ((szcvec & 0x1) == 0) { 3249 nszc++; 3250 szcvec >>= 1; 3251 continue; 3252 } 3253 nszc++; 3254 pgsz = page_get_pagesize(nszc); 3255 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3256 if (a != addr) { 3257 ASSERT(szc > 0); 3258 ASSERT(a < eaddr); 3259 segsize = a - addr; 3260 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3261 save_szcvec); 3262 if (error) { 3263 return (error); 3264 } 3265 addr = a; 3266 } 3267 szc = nszc; 3268 szcvec >>= 1; 3269 } 3270 3271 ASSERT(addr < eaddr); 3272 szcvec = save_szcvec; 3273 while (szcvec) { 3274 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3275 ASSERT(a >= addr); 3276 if (a != addr) { 3277 ASSERT(szc > 0); 3278 segsize = a - addr; 3279 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3280 save_szcvec); 3281 if (error) { 3282 return (error); 3283 } 3284 addr = a; 3285 } 3286 szcvec &= ~(1 << szc); 3287 if (szcvec) { 3288 szc = highbit(szcvec) - 1; 3289 pgsz = page_get_pagesize(szc); 3290 } 3291 } 3292 ASSERT(addr == eaddr); 3293 3294 return (0); 3295 } 3296 3297 /* 3298 * Set the default large page size for the range. Called via memcntl with 3299 * page size set to 0. as_set_default_lpsize breaks the range down into 3300 * chunks with the same type/flags, ignores non-segvn segments, and passes 3301 * each chunk to as_iset_default_lpsize(). 3302 */ 3303 int 3304 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size) 3305 { 3306 struct seg *seg; 3307 caddr_t raddr; 3308 size_t rsize; 3309 size_t ssize; 3310 int rtype, rflags; 3311 int stype, sflags; 3312 int error; 3313 caddr_t setaddr; 3314 size_t setsize; 3315 int segvn; 3316 3317 if (size == 0) 3318 return (0); 3319 3320 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3321 again: 3322 error = 0; 3323 3324 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3325 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 3326 (size_t)raddr; 3327 3328 if (raddr + rsize < raddr) { /* check for wraparound */ 3329 AS_LOCK_EXIT(as, &as->a_lock); 3330 return (ENOMEM); 3331 } 3332 as_clearwatchprot(as, raddr, rsize); 3333 seg = as_segat(as, raddr); 3334 if (seg == NULL) { 3335 as_setwatch(as); 3336 AS_LOCK_EXIT(as, &as->a_lock); 3337 return (ENOMEM); 3338 } 3339 if (seg->s_ops == &segvn_ops) { 3340 rtype = segop_gettype(seg, addr); 3341 rflags = rtype & (MAP_TEXT | MAP_INITDATA); 3342 rtype = rtype & (MAP_SHARED | MAP_PRIVATE); 3343 segvn = 1; 3344 } else { 3345 segvn = 0; 3346 } 3347 setaddr = raddr; 3348 setsize = 0; 3349 3350 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3351 if (raddr >= (seg->s_base + seg->s_size)) { 3352 seg = AS_SEGNEXT(as, seg); 3353 if (seg == NULL || raddr != seg->s_base) { 3354 error = ENOMEM; 3355 break; 3356 } 3357 if (seg->s_ops == &segvn_ops) { 3358 stype = segop_gettype(seg, raddr); 3359 sflags = stype & (MAP_TEXT | MAP_INITDATA); 3360 stype &= (MAP_SHARED | MAP_PRIVATE); 3361 if (segvn && (rflags != sflags || 3362 rtype != stype)) { 3363 /* 3364 * The next segment is also segvn but 3365 * has different flags and/or type.
3366 */ 3367 ASSERT(setsize != 0); 3368 error = as_iset_default_lpsize(as, 3369 setaddr, setsize, rflags, rtype); 3370 if (error) { 3371 break; 3372 } 3373 rflags = sflags; 3374 rtype = stype; 3375 setaddr = raddr; 3376 setsize = 0; 3377 } else if (!segvn) { 3378 rflags = sflags; 3379 rtype = stype; 3380 setaddr = raddr; 3381 setsize = 0; 3382 segvn = 1; 3383 } 3384 } else if (segvn) { 3385 /* The next segment is not segvn. */ 3386 ASSERT(setsize != 0); 3387 error = as_iset_default_lpsize(as, 3388 setaddr, setsize, rflags, rtype); 3389 if (error) { 3390 break; 3391 } 3392 segvn = 0; 3393 } 3394 } 3395 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3396 ssize = seg->s_base + seg->s_size - raddr; 3397 } else { 3398 ssize = rsize; 3399 } 3400 } 3401 if (error == 0 && segvn) { 3402 /* The last chunk when rsize == 0. */ 3403 ASSERT(setsize != 0); 3404 error = as_iset_default_lpsize(as, setaddr, setsize, 3405 rflags, rtype); 3406 } 3407 3408 if (error == IE_RETRY) { 3409 goto again; 3410 } else if (error == IE_NOMEM) { 3411 error = EAGAIN; 3412 } else if (error == ENOTSUP) { 3413 error = EINVAL; 3414 } else if (error == EAGAIN) { 3415 mutex_enter(&as->a_contents); 3416 if (!AS_ISNOUNMAPWAIT(as)) { 3417 if (AS_ISUNMAPWAIT(as) == 0) { 3418 cv_broadcast(&as->a_cv); 3419 } 3420 AS_SETUNMAPWAIT(as); 3421 AS_LOCK_EXIT(as, &as->a_lock); 3422 while (AS_ISUNMAPWAIT(as)) { 3423 cv_wait(&as->a_cv, &as->a_contents); 3424 } 3425 mutex_exit(&as->a_contents); 3426 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3427 } else { 3428 /* 3429 * We may have raced with 3430 * segvn_reclaim()/segspt_reclaim(). In this case 3431 * clean nounmapwait flag and retry since softlockcnt 3432 * in this segment may be already 0. We don't drop as 3433 * writer lock so our number of retries without 3434 * sleeping should be very small. See segvn_reclaim() 3435 * for more comments. 3436 */ 3437 AS_CLRNOUNMAPWAIT(as); 3438 mutex_exit(&as->a_contents); 3439 } 3440 goto again; 3441 } 3442 3443 as_setwatch(as); 3444 AS_LOCK_EXIT(as, &as->a_lock); 3445 return (error); 3446 } 3447 3448 /* 3449 * Set up all of the uninitialized watched pages that we can. 3450 */ 3451 void 3452 as_setwatch(struct as *as) 3453 { 3454 struct watched_page *pwp; 3455 struct seg *seg; 3456 caddr_t vaddr; 3457 uint_t prot; 3458 int err, retrycnt; 3459 3460 if (avl_numnodes(&as->a_wpage) == 0) 3461 return; 3462 3463 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3464 3465 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3466 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3467 retrycnt = 0; 3468 retry: 3469 vaddr = pwp->wp_vaddr; 3470 if (pwp->wp_oprot != 0 || /* already set up */ 3471 (seg = as_segat(as, vaddr)) == NULL || 3472 segop_getprot(seg, vaddr, 0, &prot) != 0) 3473 continue; 3474 3475 pwp->wp_oprot = prot; 3476 if (pwp->wp_read) 3477 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3478 if (pwp->wp_write) 3479 prot &= ~PROT_WRITE; 3480 if (pwp->wp_exec) 3481 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3482 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3483 err = segop_setprot(seg, vaddr, PAGESIZE, prot); 3484 if (err == IE_RETRY) { 3485 pwp->wp_oprot = 0; 3486 ASSERT(retrycnt == 0); 3487 retrycnt++; 3488 goto retry; 3489 } 3490 } 3491 pwp->wp_prot = prot; 3492 } 3493 } 3494 3495 /* 3496 * Clear all of the watched pages in the address space.
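* That is, put back each watched page's original protections; this is the inverse of as_setwatch() above.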
3497 */ 3498 void 3499 as_clearwatch(struct as *as) 3500 { 3501 struct watched_page *pwp; 3502 struct seg *seg; 3503 caddr_t vaddr; 3504 uint_t prot; 3505 int err, retrycnt; 3506 3507 if (avl_numnodes(&as->a_wpage) == 0) 3508 return; 3509 3510 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3511 3512 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3513 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3514 retrycnt = 0; 3515 retry: 3516 vaddr = pwp->wp_vaddr; 3517 if (pwp->wp_oprot == 0 || /* not set up */ 3518 (seg = as_segat(as, vaddr)) == NULL) 3519 continue; 3520 3521 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3522 err = segop_setprot(seg, vaddr, PAGESIZE, prot); 3523 if (err == IE_RETRY) { 3524 ASSERT(retrycnt == 0); 3525 retrycnt++; 3526 goto retry; 3527 } 3528 } 3529 pwp->wp_oprot = 0; 3530 pwp->wp_prot = 0; 3531 } 3532 } 3533 3534 /* 3535 * Force a new setup for all the watched pages in the range. 3536 */ 3537 static void 3538 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3539 { 3540 struct watched_page *pwp; 3541 struct watched_page tpw; 3542 caddr_t eaddr = addr + size; 3543 caddr_t vaddr; 3544 struct seg *seg; 3545 int err, retrycnt; 3546 uint_t wprot; 3547 avl_index_t where; 3548 3549 if (avl_numnodes(&as->a_wpage) == 0) 3550 return; 3551 3552 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3553 3554 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3555 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3556 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3557 3558 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3559 retrycnt = 0; 3560 vaddr = pwp->wp_vaddr; 3561 3562 wprot = prot; 3563 if (pwp->wp_read) 3564 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3565 if (pwp->wp_write) 3566 wprot &= ~PROT_WRITE; 3567 if (pwp->wp_exec) 3568 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3569 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3570 retry: 3571 seg = as_segat(as, vaddr); 3572 if (seg == NULL) { 3573 panic("as_setwatchprot: no seg"); 3574 /*NOTREACHED*/ 3575 } 3576 err = segop_setprot(seg, vaddr, PAGESIZE, wprot); 3577 if (err == IE_RETRY) { 3578 ASSERT(retrycnt == 0); 3579 retrycnt++; 3580 goto retry; 3581 } 3582 } 3583 pwp->wp_oprot = prot; 3584 pwp->wp_prot = wprot; 3585 3586 pwp = AVL_NEXT(&as->a_wpage, pwp); 3587 } 3588 } 3589 3590 /* 3591 * Clear all of the watched pages in the range. 
3592 */ 3593 static void 3594 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3595 { 3596 caddr_t eaddr = addr + size; 3597 struct watched_page *pwp; 3598 struct watched_page tpw; 3599 uint_t prot; 3600 struct seg *seg; 3601 int err, retrycnt; 3602 avl_index_t where; 3603 3604 if (avl_numnodes(&as->a_wpage) == 0) 3605 return; 3606 3607 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3608 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3609 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3610 3611 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3612 3613 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3614 3615 if ((prot = pwp->wp_oprot) != 0) { 3616 retrycnt = 0; 3617 3618 if (prot != pwp->wp_prot) { 3619 retry: 3620 seg = as_segat(as, pwp->wp_vaddr); 3621 if (seg == NULL) 3622 continue; 3623 err = segop_setprot(seg, pwp->wp_vaddr, 3624 PAGESIZE, prot); 3625 if (err == IE_RETRY) { 3626 ASSERT(retrycnt == 0); 3627 retrycnt++; 3628 goto retry; 3629 3630 } 3631 } 3632 pwp->wp_oprot = 0; 3633 pwp->wp_prot = 0; 3634 } 3635 3636 pwp = AVL_NEXT(&as->a_wpage, pwp); 3637 } 3638 } 3639 3640 void 3641 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3642 { 3643 struct proc *p; 3644 3645 mutex_enter(&pidlock); 3646 for (p = practive; p; p = p->p_next) { 3647 if (p->p_as == as) { 3648 mutex_enter(&p->p_lock); 3649 if (p->p_as == as) 3650 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3651 mutex_exit(&p->p_lock); 3652 } 3653 } 3654 mutex_exit(&pidlock); 3655 } 3656 3657 /* 3658 * return memory object ID 3659 */ 3660 int 3661 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) 3662 { 3663 struct seg *seg; 3664 int sts; 3665 3666 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 3667 seg = as_segat(as, addr); 3668 if (seg == NULL) { 3669 AS_LOCK_EXIT(as, &as->a_lock); 3670 return (EFAULT); 3671 } 3672 3673 sts = segop_getmemid(seg, addr, memidp); 3674 3675 AS_LOCK_EXIT(as, &as->a_lock); 3676 return (sts); 3677 }