1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * Copyright 2015, Joyent, Inc. All rights reserved. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 /* 41 * VM - address spaces. 
42 */ 43 44 #include <sys/types.h> 45 #include <sys/t_lock.h> 46 #include <sys/param.h> 47 #include <sys/errno.h> 48 #include <sys/systm.h> 49 #include <sys/mman.h> 50 #include <sys/sysmacros.h> 51 #include <sys/cpuvar.h> 52 #include <sys/sysinfo.h> 53 #include <sys/kmem.h> 54 #include <sys/vnode.h> 55 #include <sys/vmsystm.h> 56 #include <sys/cmn_err.h> 57 #include <sys/debug.h> 58 #include <sys/tnf_probe.h> 59 #include <sys/vtrace.h> 60 61 #include <vm/hat.h> 62 #include <vm/xhat.h> 63 #include <vm/as.h> 64 #include <vm/seg.h> 65 #include <vm/seg_vn.h> 66 #include <vm/seg_dev.h> 67 #include <vm/seg_kmem.h> 68 #include <vm/seg_map.h> 69 #include <vm/seg_spt.h> 70 #include <vm/page.h> 71 72 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ 73 74 static struct kmem_cache *as_cache; 75 76 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); 77 static void as_clearwatchprot(struct as *, caddr_t, size_t); 78 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *); 79 80 81 /* 82 * Verifying the segment lists is very time-consuming; it may not be 83 * desirable always to define VERIFY_SEGLIST when DEBUG is set. 84 */ 85 #ifdef DEBUG 86 #define VERIFY_SEGLIST 87 int do_as_verify = 0; 88 #endif 89 90 /* 91 * Allocate a new callback data structure entry and fill in the events of 92 * interest, the address range of interest, and the callback argument. 93 * Link the entry on the as->a_callbacks list. A callback entry for the 94 * entire address space may be specified with vaddr = 0 and size = -1. 95 * 96 * CALLERS RESPONSIBILITY: If not calling from within the process context for 97 * the specified as, the caller must guarantee persistence of the specified as 98 * for the duration of this function (eg. pages being locked within the as 99 * will guarantee persistence). 
100 */ 101 int 102 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events, 103 caddr_t vaddr, size_t size, int sleepflag) 104 { 105 struct as_callback *current_head, *cb; 106 caddr_t saddr; 107 size_t rsize; 108 109 /* callback function and an event are mandatory */ 110 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0)) 111 return (EINVAL); 112 113 /* Adding a callback after as_free has been called is not allowed */ 114 if (as == &kas) 115 return (ENOMEM); 116 117 /* 118 * vaddr = 0 and size = -1 is used to indicate that the callback range 119 * is the entire address space so no rounding is done in that case. 120 */ 121 if (size != -1) { 122 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK); 123 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) - 124 (size_t)saddr; 125 /* check for wraparound */ 126 if (saddr + rsize < saddr) 127 return (ENOMEM); 128 } else { 129 if (vaddr != 0) 130 return (EINVAL); 131 saddr = vaddr; 132 rsize = size; 133 } 134 135 /* Allocate and initialize a callback entry */ 136 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag); 137 if (cb == NULL) 138 return (EAGAIN); 139 140 cb->ascb_func = cb_func; 141 cb->ascb_arg = arg; 142 cb->ascb_events = events; 143 cb->ascb_saddr = saddr; 144 cb->ascb_len = rsize; 145 146 /* Add the entry to the list */ 147 mutex_enter(&as->a_contents); 148 current_head = as->a_callbacks; 149 as->a_callbacks = cb; 150 cb->ascb_next = current_head; 151 152 /* 153 * The call to this function may lose in a race with 154 * a pertinent event - eg. a thread does long term memory locking 155 * but before the callback is added another thread executes as_unmap. 156 * A broadcast here resolves that. 157 */ 158 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) { 159 AS_CLRUNMAPWAIT(as); 160 cv_broadcast(&as->a_cv); 161 } 162 163 mutex_exit(&as->a_contents); 164 return (0); 165 } 166 167 /* 168 * Search the callback list for an entry which pertains to arg. 
169 * 170 * This is called from within the client upon completion of the callback. 171 * RETURN VALUES: 172 * AS_CALLBACK_DELETED (callback entry found and deleted) 173 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok) 174 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this 175 * entry will be made in as_do_callbacks) 176 * 177 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED 178 * set, it indicates that as_do_callbacks is processing this entry. The 179 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made 180 * to unblock as_do_callbacks, in case it is blocked. 181 * 182 * CALLERS RESPONSIBILITY: If not calling from within the process context for 183 * the specified as, the caller must guarantee persistence of the specified as 184 * for the duration of this function (eg. pages being locked within the as 185 * will guarantee persistence). 186 */ 187 uint_t 188 as_delete_callback(struct as *as, void *arg) 189 { 190 struct as_callback **prevcb = &as->a_callbacks; 191 struct as_callback *cb; 192 uint_t rc = AS_CALLBACK_NOTFOUND; 193 194 mutex_enter(&as->a_contents); 195 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) { 196 if (cb->ascb_arg != arg) 197 continue; 198 199 /* 200 * If the events indicate AS_CALLBACK_CALLED, just clear 201 * AS_ALL_EVENT in the events field and wakeup the thread 202 * that may be waiting in as_do_callbacks. as_do_callbacks 203 * will take care of removing this entry from the list. In 204 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise 205 * (AS_CALLBACK_CALLED not set), just remove it from the 206 * list, return the memory and return AS_CALLBACK_DELETED. 
207 */ 208 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) { 209 /* leave AS_CALLBACK_CALLED */ 210 cb->ascb_events &= ~AS_ALL_EVENT; 211 rc = AS_CALLBACK_DELETE_DEFERRED; 212 cv_broadcast(&as->a_cv); 213 } else { 214 *prevcb = cb->ascb_next; 215 kmem_free(cb, sizeof (struct as_callback)); 216 rc = AS_CALLBACK_DELETED; 217 } 218 break; 219 } 220 mutex_exit(&as->a_contents); 221 return (rc); 222 } 223 224 /* 225 * Searches the as callback list for a matching entry. 226 * Returns a pointer to the first matching callback, or NULL if 227 * nothing is found. 228 * This function never sleeps so it is ok to call it with more 229 * locks held but the (required) a_contents mutex. 230 * 231 * See also comment on as_do_callbacks below. 232 */ 233 static struct as_callback * 234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr, 235 size_t event_len) 236 { 237 struct as_callback *cb; 238 239 ASSERT(MUTEX_HELD(&as->a_contents)); 240 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) { 241 /* 242 * If the callback has not already been called, then 243 * check if events or address range pertains. An event_len 244 * of zero means do an unconditional callback. 245 */ 246 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) || 247 ((event_len != 0) && (((cb->ascb_events & events) == 0) || 248 (event_addr + event_len < cb->ascb_saddr) || 249 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) { 250 continue; 251 } 252 break; 253 } 254 return (cb); 255 } 256 257 /* 258 * Executes a given callback and removes it from the callback list for 259 * this address space. 260 * This function may sleep so the caller must drop all locks except 261 * a_contents before calling this func. 262 * 263 * See also comments on as_do_callbacks below. 
264 */ 265 static void 266 as_execute_callback(struct as *as, struct as_callback *cb, 267 uint_t events) 268 { 269 struct as_callback **prevcb; 270 void *cb_arg; 271 272 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events)); 273 cb->ascb_events |= AS_CALLBACK_CALLED; 274 mutex_exit(&as->a_contents); 275 (*cb->ascb_func)(as, cb->ascb_arg, events); 276 mutex_enter(&as->a_contents); 277 /* 278 * the callback function is required to delete the callback 279 * when the callback function determines it is OK for 280 * this thread to continue. as_delete_callback will clear 281 * the AS_ALL_EVENT in the events field when it is deleted. 282 * If the callback function called as_delete_callback, 283 * events will already be cleared and there will be no blocking. 284 */ 285 while ((cb->ascb_events & events) != 0) { 286 cv_wait(&as->a_cv, &as->a_contents); 287 } 288 /* 289 * This entry needs to be taken off the list. Normally, the 290 * callback func itself does that, but unfortunately the list 291 * may have changed while the callback was running because the 292 * a_contents mutex was dropped and someone else other than the 293 * callback func itself could have called as_delete_callback, 294 * so we have to search to find this entry again. The entry 295 * must have AS_CALLBACK_CALLED, and have the same 'arg'. 296 */ 297 cb_arg = cb->ascb_arg; 298 prevcb = &as->a_callbacks; 299 for (cb = as->a_callbacks; cb != NULL; 300 prevcb = &cb->ascb_next, cb = *prevcb) { 301 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) || 302 (cb_arg != cb->ascb_arg)) { 303 continue; 304 } 305 *prevcb = cb->ascb_next; 306 kmem_free(cb, sizeof (struct as_callback)); 307 break; 308 } 309 } 310 311 /* 312 * Check the callback list for a matching event and intersection of 313 * address range. If there is a match invoke the callback. 
Skip an entry if: 314 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED) 315 * - not event of interest 316 * - not address range of interest 317 * 318 * An event_len of zero indicates a request for an unconditional callback 319 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The 320 * a_contents lock must be dropped before a callback, so only one callback 321 * can be done before returning. Return -1 (true) if a callback was 322 * executed and removed from the list, else return 0 (false). 323 * 324 * The logically separate parts, i.e. finding a matching callback and 325 * executing a given callback have been separated into two functions 326 * so that they can be called with different sets of locks held beyond 327 * the always-required a_contents. as_find_callback does not sleep so 328 * it is ok to call it if more locks than a_contents (i.e. the a_lock 329 * rwlock) are held. as_execute_callback on the other hand may sleep 330 * so all locks beyond a_contents must be dropped by the caller if one 331 * does not want to end comatose. 332 */ 333 static int 334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr, 335 size_t event_len) 336 { 337 struct as_callback *cb; 338 339 if ((cb = as_find_callback(as, events, event_addr, event_len))) { 340 as_execute_callback(as, cb, events); 341 return (-1); 342 } 343 return (0); 344 } 345 346 /* 347 * Search for the segment containing addr. If a segment containing addr 348 * exists, that segment is returned. If no such segment exists, and 349 * the list spans addresses greater than addr, then the first segment 350 * whose base is greater than addr is returned; otherwise, NULL is 351 * returned unless tail is true, in which case the last element of the 352 * list is returned. 353 * 354 * a_seglast is used to cache the last found segment for repeated 355 * searches to the same addr (which happens frequently). 
356 */ 357 struct seg * 358 as_findseg(struct as *as, caddr_t addr, int tail) 359 { 360 struct seg *seg = as->a_seglast; 361 avl_index_t where; 362 363 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 364 365 if (seg != NULL && 366 seg->s_base <= addr && 367 addr < seg->s_base + seg->s_size) 368 return (seg); 369 370 seg = avl_find(&as->a_segtree, &addr, &where); 371 if (seg != NULL) 372 return (as->a_seglast = seg); 373 374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); 375 if (seg == NULL && tail) 376 seg = avl_last(&as->a_segtree); 377 return (as->a_seglast = seg); 378 } 379 380 #ifdef VERIFY_SEGLIST 381 /* 382 * verify that the linked list is coherent 383 */ 384 static void 385 as_verify(struct as *as) 386 { 387 struct seg *seg, *seglast, *p, *n; 388 uint_t nsegs = 0; 389 390 if (do_as_verify == 0) 391 return; 392 393 seglast = as->a_seglast; 394 395 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 396 ASSERT(seg->s_as == as); 397 p = AS_SEGPREV(as, seg); 398 n = AS_SEGNEXT(as, seg); 399 ASSERT(p == NULL || p->s_as == as); 400 ASSERT(p == NULL || p->s_base < seg->s_base); 401 ASSERT(n == NULL || n->s_base > seg->s_base); 402 ASSERT(n != NULL || seg == avl_last(&as->a_segtree)); 403 if (seg == seglast) 404 seglast = NULL; 405 nsegs++; 406 } 407 ASSERT(seglast == NULL); 408 ASSERT(avl_numnodes(&as->a_segtree) == nsegs); 409 } 410 #endif /* VERIFY_SEGLIST */ 411 412 /* 413 * Add a new segment to the address space. The avl_find() 414 * may be expensive so we attempt to use last segment accessed 415 * in as_gap() as an insertion point. 
416 */ 417 int 418 as_addseg(struct as *as, struct seg *newseg) 419 { 420 struct seg *seg; 421 caddr_t addr; 422 caddr_t eaddr; 423 avl_index_t where; 424 425 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 426 427 as->a_updatedir = 1; /* inform /proc */ 428 gethrestime(&as->a_updatetime); 429 430 if (as->a_lastgaphl != NULL) { 431 struct seg *hseg = NULL; 432 struct seg *lseg = NULL; 433 434 if (as->a_lastgaphl->s_base > newseg->s_base) { 435 hseg = as->a_lastgaphl; 436 lseg = AVL_PREV(&as->a_segtree, hseg); 437 } else { 438 lseg = as->a_lastgaphl; 439 hseg = AVL_NEXT(&as->a_segtree, lseg); 440 } 441 442 if (hseg && lseg && lseg->s_base < newseg->s_base && 443 hseg->s_base > newseg->s_base) { 444 avl_insert_here(&as->a_segtree, newseg, lseg, 445 AVL_AFTER); 446 as->a_lastgaphl = NULL; 447 as->a_seglast = newseg; 448 return (0); 449 } 450 as->a_lastgaphl = NULL; 451 } 452 453 addr = newseg->s_base; 454 eaddr = addr + newseg->s_size; 455 again: 456 457 seg = avl_find(&as->a_segtree, &addr, &where); 458 459 if (seg == NULL) 460 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); 461 462 if (seg == NULL) 463 seg = avl_last(&as->a_segtree); 464 465 if (seg != NULL) { 466 caddr_t base = seg->s_base; 467 468 /* 469 * If top of seg is below the requested address, then 470 * the insertion point is at the end of the linked list, 471 * and seg points to the tail of the list. Otherwise, 472 * the insertion point is immediately before seg. 473 */ 474 if (base + seg->s_size > addr) { 475 if (addr >= base || eaddr > base) { 476 #ifdef __sparc 477 extern struct seg_ops segnf_ops; 478 479 /* 480 * no-fault segs must disappear if overlaid. 
481 * XXX need new segment type so 482 * we don't have to check s_ops 483 */ 484 if (seg->s_ops == &segnf_ops) { 485 seg_unmap(seg); 486 goto again; 487 } 488 #endif 489 return (-1); /* overlapping segment */ 490 } 491 } 492 } 493 as->a_seglast = newseg; 494 avl_insert(&as->a_segtree, newseg, where); 495 496 #ifdef VERIFY_SEGLIST 497 as_verify(as); 498 #endif 499 return (0); 500 } 501 502 struct seg * 503 as_removeseg(struct as *as, struct seg *seg) 504 { 505 avl_tree_t *t; 506 507 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 508 509 as->a_updatedir = 1; /* inform /proc */ 510 gethrestime(&as->a_updatetime); 511 512 if (seg == NULL) 513 return (NULL); 514 515 t = &as->a_segtree; 516 if (as->a_seglast == seg) 517 as->a_seglast = NULL; 518 as->a_lastgaphl = NULL; 519 520 /* 521 * if this segment is at an address higher than 522 * a_lastgap, set a_lastgap to the next segment (NULL if last segment) 523 */ 524 if (as->a_lastgap && 525 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base)) 526 as->a_lastgap = AVL_NEXT(t, seg); 527 528 /* 529 * remove the segment from the seg tree 530 */ 531 avl_remove(t, seg); 532 533 #ifdef VERIFY_SEGLIST 534 as_verify(as); 535 #endif 536 return (seg); 537 } 538 539 /* 540 * Find a segment containing addr. 541 */ 542 struct seg * 543 as_segat(struct as *as, caddr_t addr) 544 { 545 struct seg *seg = as->a_seglast; 546 547 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 548 549 if (seg != NULL && seg->s_base <= addr && 550 addr < seg->s_base + seg->s_size) 551 return (seg); 552 553 seg = avl_find(&as->a_segtree, &addr, NULL); 554 return (seg); 555 } 556 557 /* 558 * Serialize all searches for holes in an address space to 559 * prevent two or more threads from allocating the same virtual 560 * address range. The address space must not be "read/write" 561 * locked by the caller since we may block. 
562 */ 563 void 564 as_rangelock(struct as *as) 565 { 566 mutex_enter(&as->a_contents); 567 while (AS_ISCLAIMGAP(as)) 568 cv_wait(&as->a_cv, &as->a_contents); 569 AS_SETCLAIMGAP(as); 570 mutex_exit(&as->a_contents); 571 } 572 573 /* 574 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads. 575 */ 576 void 577 as_rangeunlock(struct as *as) 578 { 579 mutex_enter(&as->a_contents); 580 AS_CLRCLAIMGAP(as); 581 cv_signal(&as->a_cv); 582 mutex_exit(&as->a_contents); 583 } 584 585 /* 586 * compar segments (or just an address) by segment address range 587 */ 588 static int 589 as_segcompar(const void *x, const void *y) 590 { 591 struct seg *a = (struct seg *)x; 592 struct seg *b = (struct seg *)y; 593 594 if (a->s_base < b->s_base) 595 return (-1); 596 if (a->s_base >= b->s_base + b->s_size) 597 return (1); 598 return (0); 599 } 600 601 602 void 603 as_avlinit(struct as *as) 604 { 605 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg), 606 offsetof(struct seg, s_tree)); 607 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page), 608 offsetof(struct watched_page, wp_link)); 609 } 610 611 /*ARGSUSED*/ 612 static int 613 as_constructor(void *buf, void *cdrarg, int kmflags) 614 { 615 struct as *as = buf; 616 617 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL); 618 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL); 619 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL); 620 as_avlinit(as); 621 return (0); 622 } 623 624 /*ARGSUSED1*/ 625 static void 626 as_destructor(void *buf, void *cdrarg) 627 { 628 struct as *as = buf; 629 630 avl_destroy(&as->a_segtree); 631 mutex_destroy(&as->a_contents); 632 cv_destroy(&as->a_cv); 633 rw_destroy(&as->a_lock); 634 } 635 636 void 637 as_init(void) 638 { 639 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0, 640 as_constructor, as_destructor, NULL, NULL, NULL, 0); 641 } 642 643 /* 644 * Allocate and initialize an address space data structure. 
645 * We call hat_alloc to allow any machine dependent 646 * information in the hat structure to be initialized. 647 */ 648 struct as * 649 as_alloc(void) 650 { 651 struct as *as; 652 653 as = kmem_cache_alloc(as_cache, KM_SLEEP); 654 655 as->a_flags = 0; 656 as->a_vbits = 0; 657 as->a_hrm = NULL; 658 as->a_seglast = NULL; 659 as->a_size = 0; 660 as->a_resvsize = 0; 661 as->a_updatedir = 0; 662 gethrestime(&as->a_updatetime); 663 as->a_objectdir = NULL; 664 as->a_sizedir = 0; 665 as->a_userlimit = (caddr_t)USERLIMIT; 666 as->a_lastgap = NULL; 667 as->a_lastgaphl = NULL; 668 as->a_callbacks = NULL; 669 670 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 671 as->a_hat = hat_alloc(as); /* create hat for default system mmu */ 672 AS_LOCK_EXIT(as, &as->a_lock); 673 674 as->a_xhat = NULL; 675 676 return (as); 677 } 678 679 /* 680 * Free an address space data structure. 681 * Need to free the hat first and then 682 * all the segments on this as and finally 683 * the space for the as struct itself. 684 */ 685 void 686 as_free(struct as *as) 687 { 688 struct hat *hat = as->a_hat; 689 struct seg *seg, *next; 690 int called = 0; 691 692 top: 693 /* 694 * Invoke ALL callbacks. as_do_callbacks will do one callback 695 * per call, and not return (-1) until the callback has completed. 696 * When as_do_callbacks returns zero, all callbacks have completed. 
697 */ 698 mutex_enter(&as->a_contents); 699 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0)) 700 ; 701 702 /* This will prevent new XHATs from attaching to as */ 703 if (!called) 704 AS_SETBUSY(as); 705 mutex_exit(&as->a_contents); 706 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 707 708 if (!called) { 709 called = 1; 710 hat_free_start(hat); 711 if (as->a_xhat != NULL) 712 xhat_free_start_all(as); 713 } 714 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) { 715 int err; 716 717 next = AS_SEGNEXT(as, seg); 718 retry: 719 err = segop_unmap(seg, seg->s_base, seg->s_size); 720 if (err == EAGAIN) { 721 mutex_enter(&as->a_contents); 722 if (as->a_callbacks) { 723 AS_LOCK_EXIT(as, &as->a_lock); 724 } else if (!AS_ISNOUNMAPWAIT(as)) { 725 /* 726 * Memory is currently locked. Wait for a 727 * cv_signal that it has been unlocked, then 728 * try the operation again. 729 */ 730 if (AS_ISUNMAPWAIT(as) == 0) 731 cv_broadcast(&as->a_cv); 732 AS_SETUNMAPWAIT(as); 733 AS_LOCK_EXIT(as, &as->a_lock); 734 while (AS_ISUNMAPWAIT(as)) 735 cv_wait(&as->a_cv, &as->a_contents); 736 } else { 737 /* 738 * We may have raced with 739 * segvn_reclaim()/segspt_reclaim(). In this 740 * case clean nounmapwait flag and retry since 741 * softlockcnt in this segment may be already 742 * 0. We don't drop as writer lock so our 743 * number of retries without sleeping should 744 * be very small. See segvn_reclaim() for 745 * more comments. 746 */ 747 AS_CLRNOUNMAPWAIT(as); 748 mutex_exit(&as->a_contents); 749 goto retry; 750 } 751 mutex_exit(&as->a_contents); 752 goto top; 753 } else { 754 /* 755 * We do not expect any other error return at this 756 * time. 
This is similar to an ASSERT in seg_unmap() 757 */ 758 ASSERT(err == 0); 759 } 760 } 761 hat_free_end(hat); 762 if (as->a_xhat != NULL) 763 xhat_free_end_all(as); 764 AS_LOCK_EXIT(as, &as->a_lock); 765 766 /* /proc stuff */ 767 ASSERT(avl_numnodes(&as->a_wpage) == 0); 768 if (as->a_objectdir) { 769 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *)); 770 as->a_objectdir = NULL; 771 as->a_sizedir = 0; 772 } 773 774 /* 775 * Free the struct as back to kmem. Assert it has no segments. 776 */ 777 ASSERT(avl_numnodes(&as->a_segtree) == 0); 778 kmem_cache_free(as_cache, as); 779 } 780 781 int 782 as_dup(struct as *as, struct proc *forkedproc) 783 { 784 struct as *newas; 785 struct seg *seg, *newseg; 786 size_t purgesize = 0; 787 int error; 788 789 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 790 as_clearwatch(as); 791 newas = as_alloc(); 792 newas->a_userlimit = as->a_userlimit; 793 newas->a_proc = forkedproc; 794 795 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER); 796 797 /* This will prevent new XHATs from attaching */ 798 mutex_enter(&as->a_contents); 799 AS_SETBUSY(as); 800 mutex_exit(&as->a_contents); 801 mutex_enter(&newas->a_contents); 802 AS_SETBUSY(newas); 803 mutex_exit(&newas->a_contents); 804 805 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD); 806 807 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 808 809 if (seg->s_flags & S_PURGE) { 810 purgesize += seg->s_size; 811 continue; 812 } 813 814 newseg = seg_alloc(newas, seg->s_base, seg->s_size); 815 if (newseg == NULL) { 816 AS_LOCK_EXIT(newas, &newas->a_lock); 817 as_setwatch(as); 818 mutex_enter(&as->a_contents); 819 AS_CLRBUSY(as); 820 mutex_exit(&as->a_contents); 821 AS_LOCK_EXIT(as, &as->a_lock); 822 as_free(newas); 823 return (-1); 824 } 825 if ((error = segop_dup(seg, newseg)) != 0) { 826 /* 827 * We call seg_free() on the new seg 828 * because the segment is not set up 829 * completely; i.e. it has no ops. 
830 */ 831 as_setwatch(as); 832 mutex_enter(&as->a_contents); 833 AS_CLRBUSY(as); 834 mutex_exit(&as->a_contents); 835 AS_LOCK_EXIT(as, &as->a_lock); 836 seg_free(newseg); 837 AS_LOCK_EXIT(newas, &newas->a_lock); 838 as_free(newas); 839 return (error); 840 } 841 newas->a_size += seg->s_size; 842 } 843 newas->a_resvsize = as->a_resvsize - purgesize; 844 845 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL); 846 if (as->a_xhat != NULL) 847 error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL); 848 849 mutex_enter(&newas->a_contents); 850 AS_CLRBUSY(newas); 851 mutex_exit(&newas->a_contents); 852 AS_LOCK_EXIT(newas, &newas->a_lock); 853 854 as_setwatch(as); 855 mutex_enter(&as->a_contents); 856 AS_CLRBUSY(as); 857 mutex_exit(&as->a_contents); 858 AS_LOCK_EXIT(as, &as->a_lock); 859 if (error != 0) { 860 as_free(newas); 861 return (error); 862 } 863 forkedproc->p_as = newas; 864 return (0); 865 } 866 867 /* 868 * Handle a ``fault'' at addr for size bytes. 869 */ 870 faultcode_t 871 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, 872 enum fault_type type, enum seg_rw rw) 873 { 874 struct seg *seg; 875 caddr_t raddr; /* rounded down addr */ 876 size_t rsize; /* rounded up size */ 877 size_t ssize; 878 faultcode_t res = 0; 879 caddr_t addrsav; 880 struct seg *segsav; 881 int as_lock_held; 882 klwp_t *lwp = ttolwp(curthread); 883 int is_xhat = 0; 884 int holding_wpage = 0; 885 extern struct seg_ops segdev_ops; 886 887 888 889 if (as->a_hat != hat) { 890 /* This must be an XHAT then */ 891 is_xhat = 1; 892 893 if ((type != F_INVAL) || (as == &kas)) 894 return (FC_NOSUPPORT); 895 } 896 897 retry: 898 if (!is_xhat) { 899 /* 900 * Indicate that the lwp is not to be stopped while waiting 901 * for a pagefault. This is to avoid deadlock while debugging 902 * a process via /proc over NFS (in particular). 903 */ 904 if (lwp != NULL) 905 lwp->lwp_nostop++; 906 907 /* 908 * same length must be used when we softlock and softunlock. 
909 * We don't support softunlocking lengths less than 910 * the original length when there is largepage support. 911 * See seg_dev.c for more comments. 912 */ 913 switch (type) { 914 915 case F_SOFTLOCK: 916 CPU_STATS_ADD_K(vm, softlock, 1); 917 break; 918 919 case F_SOFTUNLOCK: 920 break; 921 922 case F_PROT: 923 CPU_STATS_ADD_K(vm, prot_fault, 1); 924 break; 925 926 case F_INVAL: 927 CPU_STATS_ENTER_K(); 928 CPU_STATS_ADDQ(CPU, vm, as_fault, 1); 929 if (as == &kas) 930 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); 931 CPU_STATS_EXIT_K(); 932 break; 933 } 934 } 935 936 /* Kernel probe */ 937 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */, 938 tnf_opaque, address, addr, 939 tnf_fault_type, fault_type, type, 940 tnf_seg_access, access, rw); 941 942 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 943 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 944 (size_t)raddr; 945 946 /* 947 * XXX -- Don't grab the as lock for segkmap. We should grab it for 948 * correctness, but then we could be stuck holding this lock for 949 * a LONG time if the fault needs to be resolved on a slow 950 * filesystem, and then no-one will be able to exec new commands, 951 * as exec'ing requires the write lock on the as. 952 */ 953 if (as == &kas && segkmap && segkmap->s_base <= raddr && 954 raddr + size < segkmap->s_base + segkmap->s_size) { 955 /* 956 * if (as==&kas), this can't be XHAT: we've already returned 957 * FC_NOSUPPORT. 958 */ 959 seg = segkmap; 960 as_lock_held = 0; 961 } else { 962 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 963 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) { 964 /* 965 * Grab and hold the writers' lock on the as 966 * if the fault is to a watched page. 967 * This will keep CPUs from "peeking" at the 968 * address range while we're temporarily boosting 969 * the permissions for the XHAT device to 970 * resolve the fault in the segment layer. 
971 * 972 * We could check whether faulted address 973 * is within a watched page and only then grab 974 * the writer lock, but this is simpler. 975 */ 976 AS_LOCK_EXIT(as, &as->a_lock); 977 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 978 } 979 980 seg = as_segat(as, raddr); 981 if (seg == NULL) { 982 AS_LOCK_EXIT(as, &as->a_lock); 983 if ((lwp != NULL) && (!is_xhat)) 984 lwp->lwp_nostop--; 985 return (FC_NOMAP); 986 } 987 988 as_lock_held = 1; 989 } 990 991 addrsav = raddr; 992 segsav = seg; 993 994 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 995 if (raddr >= seg->s_base + seg->s_size) { 996 seg = AS_SEGNEXT(as, seg); 997 if (seg == NULL || raddr != seg->s_base) { 998 res = FC_NOMAP; 999 break; 1000 } 1001 } 1002 if (raddr + rsize > seg->s_base + seg->s_size) 1003 ssize = seg->s_base + seg->s_size - raddr; 1004 else 1005 ssize = rsize; 1006 1007 if (!is_xhat || (seg->s_ops != &segdev_ops)) { 1008 1009 if (is_xhat && avl_numnodes(&as->a_wpage) != 0 && 1010 pr_is_watchpage_as(raddr, rw, as)) { 1011 /* 1012 * Handle watch pages. If we're faulting on a 1013 * watched page from an X-hat, we have to 1014 * restore the original permissions while we 1015 * handle the fault. 1016 */ 1017 as_clearwatch(as); 1018 holding_wpage = 1; 1019 } 1020 1021 res = segop_fault(hat, seg, raddr, ssize, type, rw); 1022 1023 /* Restore watchpoints */ 1024 if (holding_wpage) { 1025 as_setwatch(as); 1026 holding_wpage = 0; 1027 } 1028 1029 if (res != 0) 1030 break; 1031 } else { 1032 /* XHAT does not support seg_dev */ 1033 res = FC_NOSUPPORT; 1034 break; 1035 } 1036 } 1037 1038 /* 1039 * If we were SOFTLOCKing and encountered a failure, 1040 * we must SOFTUNLOCK the range we already did. (Maybe we 1041 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing 1042 * right here...) 
1043 */ 1044 if (res != 0 && type == F_SOFTLOCK) { 1045 for (seg = segsav; addrsav < raddr; addrsav += ssize) { 1046 if (addrsav >= seg->s_base + seg->s_size) 1047 seg = AS_SEGNEXT(as, seg); 1048 ASSERT(seg != NULL); 1049 /* 1050 * Now call the fault routine again to perform the 1051 * unlock using S_OTHER instead of the rw variable 1052 * since we never got a chance to touch the pages. 1053 */ 1054 if (raddr > seg->s_base + seg->s_size) 1055 ssize = seg->s_base + seg->s_size - addrsav; 1056 else 1057 ssize = raddr - addrsav; 1058 (void) segop_fault(hat, seg, addrsav, ssize, 1059 F_SOFTUNLOCK, S_OTHER); 1060 } 1061 } 1062 if (as_lock_held) 1063 AS_LOCK_EXIT(as, &as->a_lock); 1064 if ((lwp != NULL) && (!is_xhat)) 1065 lwp->lwp_nostop--; 1066 1067 /* 1068 * If the lower levels returned EDEADLK for a fault, 1069 * It means that we should retry the fault. Let's wait 1070 * a bit also to let the deadlock causing condition clear. 1071 * This is part of a gross hack to work around a design flaw 1072 * in the ufs/sds logging code and should go away when the 1073 * logging code is re-designed to fix the problem. See bug 1074 * 4125102 for details of the problem. 1075 */ 1076 if (FC_ERRNO(res) == EDEADLK) { 1077 delay(deadlk_wait); 1078 res = 0; 1079 goto retry; 1080 } 1081 return (res); 1082 } 1083 1084 1085 1086 /* 1087 * Asynchronous ``fault'' at addr for size bytes. 1088 */ 1089 faultcode_t 1090 as_faulta(struct as *as, caddr_t addr, size_t size) 1091 { 1092 struct seg *seg; 1093 caddr_t raddr; /* rounded down addr */ 1094 size_t rsize; /* rounded up size */ 1095 faultcode_t res = 0; 1096 klwp_t *lwp = ttolwp(curthread); 1097 1098 retry: 1099 /* 1100 * Indicate that the lwp is not to be stopped while waiting 1101 * for a pagefault. This is to avoid deadlock while debugging 1102 * a process via /proc over NFS (in particular). 
1103 */ 1104 if (lwp != NULL) 1105 lwp->lwp_nostop++; 1106 1107 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1108 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 1109 (size_t)raddr; 1110 1111 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1112 seg = as_segat(as, raddr); 1113 if (seg == NULL) { 1114 AS_LOCK_EXIT(as, &as->a_lock); 1115 if (lwp != NULL) 1116 lwp->lwp_nostop--; 1117 return (FC_NOMAP); 1118 } 1119 1120 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) { 1121 if (raddr >= seg->s_base + seg->s_size) { 1122 seg = AS_SEGNEXT(as, seg); 1123 if (seg == NULL || raddr != seg->s_base) { 1124 res = FC_NOMAP; 1125 break; 1126 } 1127 } 1128 res = segop_faulta(seg, raddr); 1129 if (res != 0) 1130 break; 1131 } 1132 AS_LOCK_EXIT(as, &as->a_lock); 1133 if (lwp != NULL) 1134 lwp->lwp_nostop--; 1135 /* 1136 * If the lower levels returned EDEADLK for a fault, 1137 * It means that we should retry the fault. Let's wait 1138 * a bit also to let the deadlock causing condition clear. 1139 * This is part of a gross hack to work around a design flaw 1140 * in the ufs/sds logging code and should go away when the 1141 * logging code is re-designed to fix the problem. See bug 1142 * 4125102 for details of the problem. 1143 */ 1144 if (FC_ERRNO(res) == EDEADLK) { 1145 delay(deadlk_wait); 1146 res = 0; 1147 goto retry; 1148 } 1149 return (res); 1150 } 1151 1152 /* 1153 * Set the virtual mapping for the interval from [addr : addr + size) 1154 * in address space `as' to have the specified protection. 1155 * It is ok for the range to cross over several segments, 1156 * as long as they are contiguous. 
*
 * Returns 0 on success, ENOMEM if the page-rounded range wraps around or
 * is not completely covered by contiguous segments, EAGAIN if the segment
 * driver reports IE_NOMEM, or any other non-zero error handed back by
 * segop_setprot().  as_clearwatchprot() is called on the range after the
 * lock is taken; before the final return either as_setwatch() (failure)
 * or as_setwatchprot() (success) is called.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

	/*
	 * Every restart comes back here with a_lock dropped; the range is
	 * recomputed from the caller's addr/size each time.
	 */
setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/* The loop below consumes raddr/rsize; keep the full range around. */
	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * want to only lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		/* ssize: the part of the remaining range inside this seg */
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = segop_setprot(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as, &as->a_lock);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			/* a_lock was dropped in both branches that get here */
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 *
 * Returns 0 if every piece passes segop_checkprot(), ENOMEM if the
 * page-rounded range wraps around or is not completely covered by
 * contiguous segments, or the first non-zero value returned by
 * segop_checkprot().
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	else
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		/* ssize: the part of the remaining range inside this seg */
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = segop_checkprot(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Unmap the page-rounded interval covering [addr, addr + size) from
 * address space `as'.  Unlike the routines above, gaps inside the interval
 * are tolerated and simply skipped.  For every piece that is unmapped,
 * a_size is reduced by its length; a_resvsize is reduced as well unless
 * the segment is a /dev/null mapping or a partial reservation.
 *
 * Returns 0 on success, or -1 if segop_unmap() fails with an error other
 * than EAGAIN or IE_RETRY (both of which cause the operation to be
 * retried).
 */
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

	/*
	 * Every restart comes back here with a_lock dropped; the range is
	 * recomputed from the caller's addr/size each time.
	 */
top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		rsize = 0;
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = segop_unmap(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			/* a_lock was dropped in both branches that get here */
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (-1);
		}

		as->a_size -= ssize;
		if (rsize)
			as->a_resvsize -= rsize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (0);
}

/*
 * Create segments over [addr, addr + size) by calling crfp on freshly
 * allocated segs, treating szcvec as a bit vector of candidate page size
 * codes.  With szcvec <= 1 a single segment with szc 0 is created.
 * Otherwise the range is carved up at page-size alignment boundaries: the
 * first loop walks up through the size codes present in szcvec, creating a
 * piece wherever addr is not yet aligned for the next size; the second
 * loop walks back down, aligning the end of each piece against eaddr.
 * vn_a->szc is set for every piece, and vn_a->offset is advanced past each
 * piece when a vnode or amp is supplied (otherwise it is forced to 0).
 *
 * *segcreated is set to 1 once a piece has been created in the multi-size
 * path; it is not written in the single-segment path.  a_size and
 * a_resvsize grow by the length of every piece created.
 *
 * Returns 0 on success, ENOMEM if seg_alloc() fails, or the error returned
 * by crfp.  The caller must hold a_lock as writer (asserted), and addr and
 * size must be page aligned (asserted).
 */
static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
	if (!do_off) {
		vn_a->offset = 0;
	}

	/* Only the base page size is available: one segment does it. */
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	/*
	 * Ascending pass.  szc is the size code used for the piece being
	 * created; nszc is the next size code being examined.
	 */
	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(a < eaddr);
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	/*
	 * Descending pass.  pgsz and szc still describe the largest size
	 * code found above; each iteration retires the current size code
	 * from szcvec and moves on to the next highest one.
	 */
	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

/*
 * Map a vnode-backed range, letting map_pgszcvec() choose the candidate
 * page size codes.  When no size code above 0 is usable, when
 * VOP_GETATTR() fails, or when the mapping starts at or beyond the end of
 * the file, a single segment with szc 0 is created.  When the mapping
 * extends past the end of the file, only the part up to the page-rounded
 * end of file is handed to as_map_segvn_segs(); the remainder is then
 * mapped as one segment with szc 0.  _MAP_TEXTREPL is set in vn_a->flags
 * when the size handed to as_map_segvn_segs() exceeds
 * textrepl_size_thresh.
 *
 * Returns 0 on success, ENOMEM if seg_alloc() fails, or the error from
 * crfp / as_map_segvn_segs().  The caller must hold a_lock as writer and
 * pass a vnode but no amp (asserted).
 */
static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct seg *seg;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	/* Base page size only: create one segment over [addr, addr + size). */
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		/*
		 * The mapping runs past the end of the file: trim size to
		 * the page-rounded end of file and remember the full size
		 * so the tail can be mapped separately below.
		 */
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		/* Map what lies beyond the end of file as one plain segment. */
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}

/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1712 */ 1713 static int 1714 as_map_ansegs(struct as *as, caddr_t addr, size_t size, 1715 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated) 1716 { 1717 uint_t szcvec; 1718 uchar_t type; 1719 1720 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE); 1721 if (vn_a->type == MAP_SHARED) { 1722 type = MAPPGSZC_SHM; 1723 } else if (vn_a->type == MAP_PRIVATE) { 1724 if (vn_a->szc == AS_MAP_HEAP) { 1725 type = MAPPGSZC_HEAP; 1726 } else if (vn_a->szc == AS_MAP_STACK) { 1727 type = MAPPGSZC_STACK; 1728 } else { 1729 type = MAPPGSZC_PRIVM; 1730 } 1731 } 1732 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ? 1733 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE), 1734 (vn_a->flags & MAP_TEXT), type, 0); 1735 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1736 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 1737 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 1738 ASSERT(vn_a->vp == NULL); 1739 1740 return (as_map_segvn_segs(as, addr, size, szcvec, 1741 crfp, vn_a, segcreated)); 1742 } 1743 1744 int 1745 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp) 1746 { 1747 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 1748 return (as_map_locked(as, addr, size, crfp, argsp)); 1749 } 1750 1751 int 1752 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(), 1753 void *argsp) 1754 { 1755 struct seg *seg = NULL; 1756 caddr_t raddr; /* rounded down addr */ 1757 size_t rsize; /* rounded up size */ 1758 int error; 1759 int unmap = 0; 1760 struct proc *p = curproc; 1761 struct segvn_crargs crargs; 1762 1763 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1764 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 1765 (size_t)raddr; 1766 1767 /* 1768 * check for wrap around 1769 */ 1770 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) { 1771 AS_LOCK_EXIT(as, &as->a_lock); 1772 return (ENOMEM); 1773 } 1774 1775 as->a_updatedir = 1; /* inform /proc */ 1776 gethrestime(&as->a_updatetime); 1777 1778 if (as != &kas && 
as->a_size + rsize > (size_t)p->p_vmem_ctl) { 1779 AS_LOCK_EXIT(as, &as->a_lock); 1780 1781 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p, 1782 RCA_UNSAFE_ALL); 1783 1784 return (ENOMEM); 1785 } 1786 1787 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) { 1788 crargs = *(struct segvn_crargs *)argsp; 1789 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap); 1790 if (error != 0) { 1791 AS_LOCK_EXIT(as, &as->a_lock); 1792 if (unmap) { 1793 (void) as_unmap(as, addr, size); 1794 } 1795 return (error); 1796 } 1797 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) { 1798 crargs = *(struct segvn_crargs *)argsp; 1799 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap); 1800 if (error != 0) { 1801 AS_LOCK_EXIT(as, &as->a_lock); 1802 if (unmap) { 1803 (void) as_unmap(as, addr, size); 1804 } 1805 return (error); 1806 } 1807 } else { 1808 seg = seg_alloc(as, addr, size); 1809 if (seg == NULL) { 1810 AS_LOCK_EXIT(as, &as->a_lock); 1811 return (ENOMEM); 1812 } 1813 1814 error = (*crfp)(seg, argsp); 1815 if (error != 0) { 1816 seg_free(seg); 1817 AS_LOCK_EXIT(as, &as->a_lock); 1818 return (error); 1819 } 1820 /* 1821 * Add size now so as_unmap will work if as_ctl fails. 1822 */ 1823 as->a_size += rsize; 1824 as->a_resvsize += rsize; 1825 } 1826 1827 as_setwatch(as); 1828 1829 /* 1830 * If the address space is locked, 1831 * establish memory locks for the new segment. 1832 */ 1833 mutex_enter(&as->a_contents); 1834 if (AS_ISPGLCK(as)) { 1835 mutex_exit(&as->a_contents); 1836 AS_LOCK_EXIT(as, &as->a_lock); 1837 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0); 1838 if (error != 0) 1839 (void) as_unmap(as, addr, size); 1840 } else { 1841 mutex_exit(&as->a_contents); 1842 AS_LOCK_EXIT(as, &as->a_lock); 1843 } 1844 return (error); 1845 } 1846 1847 1848 /* 1849 * Delete all segments in the address space marked with S_PURGE. 1850 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c). 
1851 * These segments are deleted as a first step before calls to as_gap(), so 1852 * that they don't affect mmap() or shmat(). 1853 */ 1854 void 1855 as_purge(struct as *as) 1856 { 1857 struct seg *seg; 1858 struct seg *next_seg; 1859 1860 /* 1861 * the setting of NEEDSPURGE is protect by as_rangelock(), so 1862 * no need to grab a_contents mutex for this check 1863 */ 1864 if ((as->a_flags & AS_NEEDSPURGE) == 0) 1865 return; 1866 1867 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 1868 next_seg = NULL; 1869 seg = AS_SEGFIRST(as); 1870 while (seg != NULL) { 1871 next_seg = AS_SEGNEXT(as, seg); 1872 if (seg->s_flags & S_PURGE) 1873 (void) segop_unmap(seg, seg->s_base, seg->s_size); 1874 seg = next_seg; 1875 } 1876 AS_LOCK_EXIT(as, &as->a_lock); 1877 1878 mutex_enter(&as->a_contents); 1879 as->a_flags &= ~AS_NEEDSPURGE; 1880 mutex_exit(&as->a_contents); 1881 } 1882 1883 /* 1884 * Find a hole within [*basep, *basep + *lenp), which contains a mappable 1885 * range of addresses at least "minlen" long, where the base of the range is 1886 * at "off" phase from an "align" boundary and there is space for a 1887 * "redzone"-sized redzone on eithe rside of the range. Thus, 1888 * if align was 4M and off was 16k, the user wants a hole which will start 1889 * 16k into a 4M page. 1890 * 1891 * If flags specifies AH_HI, the hole will have the highest possible address 1892 * in the range. We use the as->a_lastgap field to figure out where to 1893 * start looking for a gap. 1894 * 1895 * Otherwise, the gap will have the lowest possible address. 1896 * 1897 * If flags specifies AH_CONTAIN, the hole will contain the address addr. 1898 * 1899 * If an adequate hole is found, *basep and *lenp are set to reflect the part of 1900 * the hole that is within range, and 0 is returned. On failure, -1 is returned. 1901 * 1902 * NOTE: This routine is not correct when base+len overflows caddr_t. 
*/
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	/*
	 * Remember the caller's values: *basep and *lenp are restored on
	 * failure, minlen and redzone are restored for the second pass.
	 */
	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	/* No segments at all: the whole requested range is one hole. */
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		} else {
			AS_LOCK_EXIT(as, &as->a_lock);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 *
	 * NOTE(review): no variable named `sseg' exists in this function;
	 * the backward case below sets up hseg and lseg instead.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			/* hseg ends below hibound: it bounds the hole below */
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
		 */
		*basep = lo;
		*lenp = hi - lo;
		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {
			/* Record where the hole was found for later calls. */
			if (!forward)
				as->a_lastgap = hseg;
			if (hseg != NULL)
				as->a_lastgaphl = hseg;
			else
				as->a_lastgaphl = lseg;
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		}
	cont:
		/*
		 * Move to the next hole.
		 */
		if (forward) {
			lseg = hseg;
			if (lseg == NULL)
				break;
			hseg = AS_SEGNEXT(as, hseg);
		} else {
			hseg = lseg;
			if (hseg == NULL)
				break;
			lseg = AS_SEGPREV(as, lseg);
		}
	}
	/*
	 * The fast path (minlen padded by align and both redzones) found
	 * nothing.  If that padding was non-zero, search once more with the
	 * caller's original minlen and redzone.
	 */
	if (fast_path && (align != 0 || save_redzone != 0)) {
		fast_path = 0;
		minlen = save_minlen;
		redzone = save_redzone;
		goto retry;
	}
	*basep = save_base;
	*lenp = save_len;
	AS_LOCK_EXIT(as, &as->a_lock);
	return (-1);
}

/*
 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, base and len are set to reflect the part of
 * the hole that is within range, and 0 is returned, otherwise,
 * -1 is returned.
 *
 * This is as_gap_aligned() with align, redzone and off all zero.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
    caddr_t addr)
{

	return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
}

/*
 * Return the next range within [base, base + len) that is backed
 * with "real memory".  Skip holes and non-seg_vn segments.
 * We're lazy and only return one segment at a time.
2087 */ 2088 int 2089 as_memory(struct as *as, caddr_t *basep, size_t *lenp) 2090 { 2091 extern struct seg_ops segspt_shmops; /* needs a header file */ 2092 struct seg *seg; 2093 caddr_t addr, eaddr; 2094 caddr_t segend; 2095 2096 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2097 2098 addr = *basep; 2099 eaddr = addr + *lenp; 2100 2101 seg = as_findseg(as, addr, 0); 2102 if (seg != NULL) 2103 addr = MAX(seg->s_base, addr); 2104 2105 for (;;) { 2106 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { 2107 AS_LOCK_EXIT(as, &as->a_lock); 2108 return (EINVAL); 2109 } 2110 2111 if (seg->s_ops == &segvn_ops) { 2112 segend = seg->s_base + seg->s_size; 2113 break; 2114 } 2115 2116 /* 2117 * We do ISM by looking into the private data 2118 * to determine the real size of the segment. 2119 */ 2120 if (seg->s_ops == &segspt_shmops) { 2121 segend = seg->s_base + spt_realsize(seg); 2122 if (addr < segend) 2123 break; 2124 } 2125 2126 seg = AS_SEGNEXT(as, seg); 2127 2128 if (seg != NULL) 2129 addr = seg->s_base; 2130 } 2131 2132 *basep = addr; 2133 2134 if (segend > eaddr) 2135 *lenp = eaddr - addr; 2136 else 2137 *lenp = segend - addr; 2138 2139 AS_LOCK_EXIT(as, &as->a_lock); 2140 return (0); 2141 } 2142 2143 /* 2144 * Swap the pages associated with the address space as out to 2145 * secondary storage, returning the number of bytes actually 2146 * swapped. 2147 * 2148 * The value returned is intended to correlate well with the process's 2149 * memory requirements. Its usefulness for this purpose depends on 2150 * how well the segment-level routines do at returning accurate 2151 * information. 2152 */ 2153 size_t 2154 as_swapout(struct as *as) 2155 { 2156 struct seg *seg; 2157 size_t swpcnt = 0; 2158 2159 /* 2160 * Kernel-only processes have given up their address 2161 * spaces. Of course, we shouldn't be attempting to 2162 * swap out such processes in the first place... 
2163 */ 2164 if (as == NULL) 2165 return (0); 2166 2167 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2168 2169 /* Prevent XHATs from attaching */ 2170 mutex_enter(&as->a_contents); 2171 AS_SETBUSY(as); 2172 mutex_exit(&as->a_contents); 2173 2174 2175 /* 2176 * Free all mapping resources associated with the address 2177 * space. The segment-level swapout routines capitalize 2178 * on this unmapping by scavanging pages that have become 2179 * unmapped here. 2180 */ 2181 hat_swapout(as->a_hat); 2182 if (as->a_xhat != NULL) 2183 xhat_swapout_all(as); 2184 2185 mutex_enter(&as->a_contents); 2186 AS_CLRBUSY(as); 2187 mutex_exit(&as->a_contents); 2188 2189 /* 2190 * Call the swapout routines of all segments in the address 2191 * space to do the actual work, accumulating the amount of 2192 * space reclaimed. 2193 */ 2194 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 2195 struct seg_ops *ov = seg->s_ops; 2196 2197 /* 2198 * We have to check to see if the seg has 2199 * an ops vector because the seg may have 2200 * been in the middle of being set up when 2201 * the process was picked for swapout. 2202 */ 2203 if ((ov != NULL) && (ov->swapout != NULL)) 2204 swpcnt += segop_swapout(seg); 2205 } 2206 AS_LOCK_EXIT(as, &as->a_lock); 2207 return (swpcnt); 2208 } 2209 2210 /* 2211 * Determine whether data from the mappings in interval [addr, addr + size) 2212 * are in the primary memory (core) cache. 
2213 */ 2214 int 2215 as_incore(struct as *as, caddr_t addr, 2216 size_t size, char *vec, size_t *sizep) 2217 { 2218 struct seg *seg; 2219 size_t ssize; 2220 caddr_t raddr; /* rounded down addr */ 2221 size_t rsize; /* rounded up size */ 2222 size_t isize; /* iteration size */ 2223 int error = 0; /* result, assume success */ 2224 2225 *sizep = 0; 2226 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2227 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - 2228 (size_t)raddr; 2229 2230 if (raddr + rsize < raddr) /* check for wraparound */ 2231 return (ENOMEM); 2232 2233 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2234 seg = as_segat(as, raddr); 2235 if (seg == NULL) { 2236 AS_LOCK_EXIT(as, &as->a_lock); 2237 return (-1); 2238 } 2239 2240 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2241 if (raddr >= seg->s_base + seg->s_size) { 2242 seg = AS_SEGNEXT(as, seg); 2243 if (seg == NULL || raddr != seg->s_base) { 2244 error = -1; 2245 break; 2246 } 2247 } 2248 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2249 ssize = seg->s_base + seg->s_size - raddr; 2250 else 2251 ssize = rsize; 2252 *sizep += isize = segop_incore(seg, raddr, ssize, vec); 2253 if (isize != ssize) { 2254 error = -1; 2255 break; 2256 } 2257 vec += btopr(ssize); 2258 } 2259 AS_LOCK_EXIT(as, &as->a_lock); 2260 return (error); 2261 } 2262 2263 static void 2264 as_segunlock(struct seg *seg, caddr_t addr, int attr, 2265 ulong_t *bitmap, size_t position, size_t npages) 2266 { 2267 caddr_t range_start; 2268 size_t pos1 = position; 2269 size_t pos2; 2270 size_t size; 2271 size_t end_pos = npages + position; 2272 2273 while (bt_range(bitmap, &pos1, &pos2, end_pos)) { 2274 size = ptob((pos2 - pos1)); 2275 range_start = (caddr_t)((uintptr_t)addr + 2276 ptob(pos1 - position)); 2277 2278 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK, 2279 (ulong_t *)NULL, (size_t)NULL); 2280 pos1 = pos2; 2281 } 2282 } 2283 2284 static void 2285 as_unlockerr(struct as *as, int attr, ulong_t 
*mlock_map, 2286 caddr_t raddr, size_t rsize) 2287 { 2288 struct seg *seg = as_segat(as, raddr); 2289 size_t ssize; 2290 2291 while (rsize != 0) { 2292 if (raddr >= seg->s_base + seg->s_size) 2293 seg = AS_SEGNEXT(as, seg); 2294 2295 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2296 ssize = seg->s_base + seg->s_size - raddr; 2297 else 2298 ssize = rsize; 2299 2300 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2301 2302 rsize -= ssize; 2303 raddr += ssize; 2304 } 2305 } 2306 2307 /* 2308 * Cache control operations over the interval [addr, addr + size) in 2309 * address space "as". 2310 */ 2311 /*ARGSUSED*/ 2312 int 2313 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2314 uintptr_t arg, ulong_t *lock_map, size_t pos) 2315 { 2316 struct seg *seg; /* working segment */ 2317 caddr_t raddr; /* rounded down addr */ 2318 caddr_t initraddr; /* saved initial rounded down addr */ 2319 size_t rsize; /* rounded up size */ 2320 size_t initrsize; /* saved initial rounded up size */ 2321 size_t ssize; /* size of seg */ 2322 int error = 0; /* result */ 2323 size_t mlock_size; /* size of bitmap */ 2324 ulong_t *mlock_map; /* pointer to bitmap used */ 2325 /* to represent the locked */ 2326 /* pages. */ 2327 retry: 2328 if (error == IE_RETRY) 2329 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2330 else 2331 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2332 2333 /* 2334 * If these are address space lock/unlock operations, loop over 2335 * all segments in the address space, as appropriate. 
2336 */ 2337 if (func == MC_LOCKAS) { 2338 size_t npages, idx; 2339 size_t rlen = 0; /* rounded as length */ 2340 2341 idx = pos; 2342 2343 if (arg & MCL_FUTURE) { 2344 mutex_enter(&as->a_contents); 2345 AS_SETPGLCK(as); 2346 mutex_exit(&as->a_contents); 2347 } 2348 if ((arg & MCL_CURRENT) == 0) { 2349 AS_LOCK_EXIT(as, &as->a_lock); 2350 return (0); 2351 } 2352 2353 seg = AS_SEGFIRST(as); 2354 if (seg == NULL) { 2355 AS_LOCK_EXIT(as, &as->a_lock); 2356 return (0); 2357 } 2358 2359 do { 2360 raddr = (caddr_t)((uintptr_t)seg->s_base & 2361 (uintptr_t)PAGEMASK); 2362 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2363 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2364 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2365 2366 mlock_size = BT_BITOUL(btopr(rlen)); 2367 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2368 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2369 AS_LOCK_EXIT(as, &as->a_lock); 2370 return (EAGAIN); 2371 } 2372 2373 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2374 error = segop_lockop(seg, seg->s_base, 2375 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2376 if (error != 0) 2377 break; 2378 pos += seg_pages(seg); 2379 } 2380 2381 if (error) { 2382 for (seg = AS_SEGFIRST(as); seg != NULL; 2383 seg = AS_SEGNEXT(as, seg)) { 2384 2385 raddr = (caddr_t)((uintptr_t)seg->s_base & 2386 (uintptr_t)PAGEMASK); 2387 npages = seg_pages(seg); 2388 as_segunlock(seg, raddr, attr, mlock_map, 2389 idx, npages); 2390 idx += npages; 2391 } 2392 } 2393 2394 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2395 AS_LOCK_EXIT(as, &as->a_lock); 2396 goto lockerr; 2397 } else if (func == MC_UNLOCKAS) { 2398 mutex_enter(&as->a_contents); 2399 AS_CLRPGLCK(as); 2400 mutex_exit(&as->a_contents); 2401 2402 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2403 error = segop_lockop(seg, seg->s_base, 2404 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2405 if (error != 0) 2406 break; 2407 } 2408 2409 AS_LOCK_EXIT(as, &as->a_lock); 2410 goto lockerr; 
2411 } 2412 2413 /* 2414 * Normalize addresses and sizes. 2415 */ 2416 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2417 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2418 (size_t)raddr; 2419 2420 if (raddr + rsize < raddr) { /* check for wraparound */ 2421 AS_LOCK_EXIT(as, &as->a_lock); 2422 return (ENOMEM); 2423 } 2424 2425 /* 2426 * Get initial segment. 2427 */ 2428 if ((seg = as_segat(as, raddr)) == NULL) { 2429 AS_LOCK_EXIT(as, &as->a_lock); 2430 return (ENOMEM); 2431 } 2432 2433 if (func == MC_LOCK) { 2434 mlock_size = BT_BITOUL(btopr(rsize)); 2435 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2436 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2437 AS_LOCK_EXIT(as, &as->a_lock); 2438 return (EAGAIN); 2439 } 2440 } 2441 2442 /* 2443 * Loop over all segments. If a hole in the address range is 2444 * discovered, then fail. For each segment, perform the appropriate 2445 * control operation. 2446 */ 2447 while (rsize != 0) { 2448 2449 /* 2450 * Make sure there's no hole, calculate the portion 2451 * of the next segment to be operated over. 2452 */ 2453 if (raddr >= seg->s_base + seg->s_size) { 2454 seg = AS_SEGNEXT(as, seg); 2455 if (seg == NULL || raddr != seg->s_base) { 2456 if (func == MC_LOCK) { 2457 as_unlockerr(as, attr, mlock_map, 2458 initraddr, initrsize - rsize); 2459 kmem_free(mlock_map, 2460 mlock_size * sizeof (ulong_t)); 2461 } 2462 AS_LOCK_EXIT(as, &as->a_lock); 2463 return (ENOMEM); 2464 } 2465 } 2466 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2467 ssize = seg->s_base + seg->s_size - raddr; 2468 else 2469 ssize = rsize; 2470 2471 /* 2472 * Dispatch on specific function. 2473 */ 2474 switch (func) { 2475 2476 /* 2477 * Synchronize cached data from mappings with backing 2478 * objects. 
2479 */ 2480 case MC_SYNC: 2481 if (error = segop_sync(seg, raddr, ssize, 2482 attr, (uint_t)arg)) { 2483 AS_LOCK_EXIT(as, &as->a_lock); 2484 return (error); 2485 } 2486 break; 2487 2488 /* 2489 * Lock pages in memory. 2490 */ 2491 case MC_LOCK: 2492 if (error = segop_lockop(seg, raddr, ssize, 2493 attr, func, mlock_map, pos)) { 2494 as_unlockerr(as, attr, mlock_map, initraddr, 2495 initrsize - rsize + ssize); 2496 kmem_free(mlock_map, mlock_size * 2497 sizeof (ulong_t)); 2498 AS_LOCK_EXIT(as, &as->a_lock); 2499 goto lockerr; 2500 } 2501 break; 2502 2503 /* 2504 * Unlock mapped pages. 2505 */ 2506 case MC_UNLOCK: 2507 (void) segop_lockop(seg, raddr, ssize, attr, func, 2508 (ulong_t *)NULL, (size_t)NULL); 2509 break; 2510 2511 /* 2512 * Store VM advise for mapped pages in segment layer. 2513 */ 2514 case MC_ADVISE: 2515 error = segop_advise(seg, raddr, ssize, (uint_t)arg); 2516 2517 /* 2518 * Check for regular errors and special retry error 2519 */ 2520 if (error) { 2521 if (error == IE_RETRY) { 2522 /* 2523 * Need to acquire writers lock, so 2524 * have to drop readers lock and start 2525 * all over again 2526 */ 2527 AS_LOCK_EXIT(as, &as->a_lock); 2528 goto retry; 2529 } else if (error == IE_REATTACH) { 2530 /* 2531 * Find segment for current address 2532 * because current segment just got 2533 * split or concatenated 2534 */ 2535 seg = as_segat(as, raddr); 2536 if (seg == NULL) { 2537 AS_LOCK_EXIT(as, &as->a_lock); 2538 return (ENOMEM); 2539 } 2540 } else { 2541 /* 2542 * Regular error 2543 */ 2544 AS_LOCK_EXIT(as, &as->a_lock); 2545 return (error); 2546 } 2547 } 2548 break; 2549 2550 case MC_INHERIT_ZERO: 2551 error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO); 2552 if (error != 0) { 2553 AS_LOCK_EXIT(as, &as->a_lock); 2554 return (error); 2555 } 2556 break; 2557 2558 /* 2559 * Can't happen. 
2560 */ 2561 default: 2562 panic("as_ctl: bad operation %d", func); 2563 /*NOTREACHED*/ 2564 } 2565 2566 rsize -= ssize; 2567 raddr += ssize; 2568 } 2569 2570 if (func == MC_LOCK) 2571 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2572 AS_LOCK_EXIT(as, &as->a_lock); 2573 return (0); 2574 lockerr: 2575 2576 /* 2577 * If the lower levels returned EDEADLK for a segment lockop, 2578 * it means that we should retry the operation. Let's wait 2579 * a bit also to let the deadlock causing condition clear. 2580 * This is part of a gross hack to work around a design flaw 2581 * in the ufs/sds logging code and should go away when the 2582 * logging code is re-designed to fix the problem. See bug 2583 * 4125102 for details of the problem. 2584 */ 2585 if (error == EDEADLK) { 2586 delay(deadlk_wait); 2587 error = 0; 2588 goto retry; 2589 } 2590 return (error); 2591 } 2592 2593 int 2594 fc_decode(faultcode_t fault_err) 2595 { 2596 int error = 0; 2597 2598 switch (FC_CODE(fault_err)) { 2599 case FC_OBJERR: 2600 error = FC_ERRNO(fault_err); 2601 break; 2602 case FC_PROT: 2603 error = EACCES; 2604 break; 2605 default: 2606 error = EFAULT; 2607 break; 2608 } 2609 return (error); 2610 } 2611 2612 /* 2613 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow 2614 * lists from each segment and copy them to one contiguous shadow list (plist) 2615 * as expected by the caller. Save pointers to per segment shadow lists at 2616 * the tail of plist so that they can be used during as_pageunlock(). 
2617 */ 2618 static int 2619 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, 2620 caddr_t addr, size_t size, enum seg_rw rw) 2621 { 2622 caddr_t sv_addr = addr; 2623 size_t sv_size = size; 2624 struct seg *sv_seg = seg; 2625 ulong_t segcnt = 1; 2626 ulong_t cnt; 2627 size_t ssize; 2628 pgcnt_t npages = btop(size); 2629 page_t **plist; 2630 page_t **pl; 2631 int error; 2632 caddr_t eaddr; 2633 faultcode_t fault_err = 0; 2634 pgcnt_t pl_off; 2635 extern struct seg_ops segspt_shmops; 2636 2637 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2638 ASSERT(seg != NULL); 2639 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2640 ASSERT(addr + size > seg->s_base + seg->s_size); 2641 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2642 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2643 2644 /* 2645 * Count the number of segments covered by the range we are about to 2646 * lock. The segment count is used to size the shadow list we return 2647 * back to the caller. 2648 */ 2649 for (; size != 0; size -= ssize, addr += ssize) { 2650 if (addr >= seg->s_base + seg->s_size) { 2651 2652 seg = AS_SEGNEXT(as, seg); 2653 if (seg == NULL || addr != seg->s_base) { 2654 AS_LOCK_EXIT(as, &as->a_lock); 2655 return (EFAULT); 2656 } 2657 /* 2658 * Do a quick check if subsequent segments 2659 * will most likely support pagelock. 
2660 */ 2661 if (seg->s_ops == &segvn_ops) { 2662 vnode_t *vp; 2663 2664 if (segop_getvp(seg, addr, &vp) != 0 || 2665 vp != NULL) { 2666 AS_LOCK_EXIT(as, &as->a_lock); 2667 goto slow; 2668 } 2669 } else if (seg->s_ops != &segspt_shmops) { 2670 AS_LOCK_EXIT(as, &as->a_lock); 2671 goto slow; 2672 } 2673 segcnt++; 2674 } 2675 if (addr + size > seg->s_base + seg->s_size) { 2676 ssize = seg->s_base + seg->s_size - addr; 2677 } else { 2678 ssize = size; 2679 } 2680 } 2681 ASSERT(segcnt > 1); 2682 2683 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); 2684 2685 addr = sv_addr; 2686 size = sv_size; 2687 seg = sv_seg; 2688 2689 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { 2690 if (addr >= seg->s_base + seg->s_size) { 2691 seg = AS_SEGNEXT(as, seg); 2692 ASSERT(seg != NULL && addr == seg->s_base); 2693 cnt++; 2694 ASSERT(cnt < segcnt); 2695 } 2696 if (addr + size > seg->s_base + seg->s_size) { 2697 ssize = seg->s_base + seg->s_size - addr; 2698 } else { 2699 ssize = size; 2700 } 2701 pl = &plist[npages + cnt]; 2702 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl, 2703 L_PAGELOCK, rw); 2704 if (error) { 2705 break; 2706 } 2707 ASSERT(plist[npages + cnt] != NULL); 2708 ASSERT(pl_off + btop(ssize) <= npages); 2709 bcopy(plist[npages + cnt], &plist[pl_off], 2710 btop(ssize) * sizeof (page_t *)); 2711 pl_off += btop(ssize); 2712 } 2713 2714 if (size == 0) { 2715 AS_LOCK_EXIT(as, &as->a_lock); 2716 ASSERT(cnt == segcnt - 1); 2717 *ppp = plist; 2718 return (0); 2719 } 2720 2721 /* 2722 * one of pagelock calls failed. The error type is in error variable. 2723 * Unlock what we've locked so far and retry with F_SOFTLOCK if error 2724 * type is either EFAULT or ENOTSUP. Otherwise just return the error 2725 * back to the caller. 
2726 */ 2727 2728 eaddr = addr; 2729 seg = sv_seg; 2730 2731 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { 2732 if (addr >= seg->s_base + seg->s_size) { 2733 seg = AS_SEGNEXT(as, seg); 2734 ASSERT(seg != NULL && addr == seg->s_base); 2735 cnt++; 2736 ASSERT(cnt < segcnt); 2737 } 2738 if (eaddr > seg->s_base + seg->s_size) { 2739 ssize = seg->s_base + seg->s_size - addr; 2740 } else { 2741 ssize = eaddr - addr; 2742 } 2743 pl = &plist[npages + cnt]; 2744 ASSERT(*pl != NULL); 2745 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl, 2746 L_PAGEUNLOCK, rw); 2747 } 2748 2749 AS_LOCK_EXIT(as, &as->a_lock); 2750 2751 kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); 2752 2753 if (error != ENOTSUP && error != EFAULT) { 2754 return (error); 2755 } 2756 2757 slow: 2758 /* 2759 * If we are here because pagelock failed due to the need to cow fault 2760 * in the pages we want to lock F_SOFTLOCK will do this job and in 2761 * next as_pagelock() call for this address range pagelock will 2762 * hopefully succeed. 2763 */ 2764 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); 2765 if (fault_err != 0) { 2766 return (fc_decode(fault_err)); 2767 } 2768 *ppp = NULL; 2769 2770 return (0); 2771 } 2772 2773 /* 2774 * lock pages in a given address space. Return shadow list. If 2775 * the list is NULL, the MMU mapping is also locked. 2776 */ 2777 int 2778 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, 2779 size_t size, enum seg_rw rw) 2780 { 2781 size_t rsize; 2782 caddr_t raddr; 2783 faultcode_t fault_err; 2784 struct seg *seg; 2785 int err; 2786 2787 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, 2788 "as_pagelock_start: addr %p size %ld", addr, size); 2789 2790 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2791 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2792 (size_t)raddr; 2793 2794 /* 2795 * if the request crosses two segments let 2796 * as_fault handle it. 
2797 */ 2798 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2799 2800 seg = as_segat(as, raddr); 2801 if (seg == NULL) { 2802 AS_LOCK_EXIT(as, &as->a_lock); 2803 return (EFAULT); 2804 } 2805 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2806 if (raddr + rsize > seg->s_base + seg->s_size) { 2807 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw)); 2808 } 2809 if (raddr + rsize <= raddr) { 2810 AS_LOCK_EXIT(as, &as->a_lock); 2811 return (EFAULT); 2812 } 2813 2814 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, 2815 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); 2816 2817 /* 2818 * try to lock pages and pass back shadow list 2819 */ 2820 err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw); 2821 2822 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); 2823 2824 AS_LOCK_EXIT(as, &as->a_lock); 2825 2826 if (err == 0 || (err != ENOTSUP && err != EFAULT)) { 2827 return (err); 2828 } 2829 2830 /* 2831 * Use F_SOFTLOCK to lock the pages because pagelock failed either due 2832 * to no pagelock support for this segment or pages need to be cow 2833 * faulted in. If fault is needed F_SOFTLOCK will do this job for 2834 * this as_pagelock() call and in the next as_pagelock() call for the 2835 * same address range pagelock call will hopefull succeed. 2836 */ 2837 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); 2838 if (fault_err != 0) { 2839 return (fc_decode(fault_err)); 2840 } 2841 *ppp = NULL; 2842 2843 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); 2844 return (0); 2845 } 2846 2847 /* 2848 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow 2849 * lists from the end of plist and call pageunlock interface for each segment. 2850 * Drop as lock and free plist. 
2851 */ 2852 static void 2853 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, 2854 struct page **plist, enum seg_rw rw) 2855 { 2856 ulong_t cnt; 2857 caddr_t eaddr = addr + size; 2858 pgcnt_t npages = btop(size); 2859 size_t ssize; 2860 page_t **pl; 2861 2862 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2863 ASSERT(seg != NULL); 2864 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2865 ASSERT(addr + size > seg->s_base + seg->s_size); 2866 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2867 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2868 ASSERT(plist != NULL); 2869 2870 for (cnt = 0; addr < eaddr; addr += ssize) { 2871 if (addr >= seg->s_base + seg->s_size) { 2872 seg = AS_SEGNEXT(as, seg); 2873 ASSERT(seg != NULL && addr == seg->s_base); 2874 cnt++; 2875 } 2876 if (eaddr > seg->s_base + seg->s_size) { 2877 ssize = seg->s_base + seg->s_size - addr; 2878 } else { 2879 ssize = eaddr - addr; 2880 } 2881 pl = &plist[npages + cnt]; 2882 ASSERT(*pl != NULL); 2883 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl, 2884 L_PAGEUNLOCK, rw); 2885 } 2886 ASSERT(cnt > 0); 2887 AS_LOCK_EXIT(as, &as->a_lock); 2888 2889 cnt++; 2890 kmem_free(plist, (npages + cnt) * sizeof (page_t *)); 2891 } 2892 2893 /* 2894 * unlock pages in a given address range 2895 */ 2896 void 2897 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2898 enum seg_rw rw) 2899 { 2900 struct seg *seg; 2901 size_t rsize; 2902 caddr_t raddr; 2903 2904 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2905 "as_pageunlock_start: addr %p size %ld", addr, size); 2906 2907 /* 2908 * if the shadow list is NULL, as_pagelock was 2909 * falling back to as_fault 2910 */ 2911 if (pp == NULL) { 2912 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2913 return; 2914 } 2915 2916 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2917 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2918 (size_t)raddr; 2919 2920 AS_LOCK_ENTER(as, &as->a_lock, 
RW_READER); 2921 seg = as_segat(as, raddr); 2922 ASSERT(seg != NULL); 2923 2924 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2925 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2926 2927 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2928 if (raddr + rsize <= seg->s_base + seg->s_size) { 2929 (void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2930 } else { 2931 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); 2932 return; 2933 } 2934 AS_LOCK_EXIT(as, &as->a_lock); 2935 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2936 } 2937 2938 int 2939 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2940 boolean_t wait) 2941 { 2942 struct seg *seg; 2943 size_t ssize; 2944 caddr_t raddr; /* rounded down addr */ 2945 size_t rsize; /* rounded up size */ 2946 int error = 0; 2947 size_t pgsz = page_get_pagesize(szc); 2948 2949 setpgsz_top: 2950 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2951 return (EINVAL); 2952 } 2953 2954 raddr = addr; 2955 rsize = size; 2956 2957 if (raddr + rsize < raddr) /* check for wraparound */ 2958 return (ENOMEM); 2959 2960 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2961 as_clearwatchprot(as, raddr, rsize); 2962 seg = as_segat(as, raddr); 2963 if (seg == NULL) { 2964 as_setwatch(as); 2965 AS_LOCK_EXIT(as, &as->a_lock); 2966 return (ENOMEM); 2967 } 2968 2969 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2970 if (raddr >= seg->s_base + seg->s_size) { 2971 seg = AS_SEGNEXT(as, seg); 2972 if (seg == NULL || raddr != seg->s_base) { 2973 error = ENOMEM; 2974 break; 2975 } 2976 } 2977 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2978 ssize = seg->s_base + seg->s_size - raddr; 2979 } else { 2980 ssize = rsize; 2981 } 2982 2983 retry: 2984 error = segop_setpagesize(seg, raddr, ssize, szc); 2985 2986 if (error == IE_NOMEM) { 2987 error = EAGAIN; 2988 break; 2989 } 2990 2991 if (error == IE_RETRY) { 2992 AS_LOCK_EXIT(as, &as->a_lock); 2993 
goto setpgsz_top; 2994 } 2995 2996 if (error == ENOTSUP) { 2997 error = EINVAL; 2998 break; 2999 } 3000 3001 if (wait && (error == EAGAIN)) { 3002 /* 3003 * Memory is currently locked. It must be unlocked 3004 * before this operation can succeed through a retry. 3005 * The possible reasons for locked memory and 3006 * corresponding strategies for unlocking are: 3007 * (1) Normal I/O 3008 * wait for a signal that the I/O operation 3009 * has completed and the memory is unlocked. 3010 * (2) Asynchronous I/O 3011 * The aio subsystem does not unlock pages when 3012 * the I/O is completed. Those pages are unlocked 3013 * when the application calls aiowait/aioerror. 3014 * So, to prevent blocking forever, cv_broadcast() 3015 * is done to wake up aio_cleanup_thread. 3016 * Subsequently, segvn_reclaim will be called, and 3017 * that will do AS_CLRUNMAPWAIT() and wake us up. 3018 * (3) Long term page locking: 3019 * This is not relevant for as_setpagesize() 3020 * because we cannot change the page size for 3021 * driver memory. The attempt to do so will 3022 * fail with a different error than EAGAIN so 3023 * there's no need to trigger as callbacks like 3024 * as_unmap, as_setprot or as_free would do. 3025 */ 3026 mutex_enter(&as->a_contents); 3027 if (!AS_ISNOUNMAPWAIT(as)) { 3028 if (AS_ISUNMAPWAIT(as) == 0) { 3029 cv_broadcast(&as->a_cv); 3030 } 3031 AS_SETUNMAPWAIT(as); 3032 AS_LOCK_EXIT(as, &as->a_lock); 3033 while (AS_ISUNMAPWAIT(as)) { 3034 cv_wait(&as->a_cv, &as->a_contents); 3035 } 3036 } else { 3037 /* 3038 * We may have raced with 3039 * segvn_reclaim()/segspt_reclaim(). In this 3040 * case clean nounmapwait flag and retry since 3041 * softlockcnt in this segment may be already 3042 * 0. We don't drop as writer lock so our 3043 * number of retries without sleeping should 3044 * be very small. See segvn_reclaim() for 3045 * more comments. 
3046 */ 3047 AS_CLRNOUNMAPWAIT(as); 3048 mutex_exit(&as->a_contents); 3049 goto retry; 3050 } 3051 mutex_exit(&as->a_contents); 3052 goto setpgsz_top; 3053 } else if (error != 0) { 3054 break; 3055 } 3056 } 3057 as_setwatch(as); 3058 AS_LOCK_EXIT(as, &as->a_lock); 3059 return (error); 3060 } 3061 3062 /* 3063 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments 3064 * in its chunk where s_szc is less than the szc we want to set. 3065 */ 3066 static int 3067 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3068 int *retry) 3069 { 3070 struct seg *seg; 3071 size_t ssize; 3072 int error; 3073 3074 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3075 3076 seg = as_segat(as, raddr); 3077 if (seg == NULL) { 3078 panic("as_iset3_default_lpsize: no seg"); 3079 } 3080 3081 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 3082 if (raddr >= seg->s_base + seg->s_size) { 3083 seg = AS_SEGNEXT(as, seg); 3084 if (seg == NULL || raddr != seg->s_base) { 3085 panic("as_iset3_default_lpsize: as changed"); 3086 } 3087 } 3088 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3089 ssize = seg->s_base + seg->s_size - raddr; 3090 } else { 3091 ssize = rsize; 3092 } 3093 3094 if (szc > seg->s_szc) { 3095 error = segop_setpagesize(seg, raddr, ssize, szc); 3096 /* Only retry on EINVAL segments that have no vnode. */ 3097 if (error == EINVAL) { 3098 vnode_t *vp = NULL; 3099 if ((segop_gettype(seg, raddr) & MAP_SHARED) && 3100 (segop_getvp(seg, raddr, &vp) != 0 || 3101 vp == NULL)) { 3102 *retry = 1; 3103 } else { 3104 *retry = 0; 3105 } 3106 } 3107 if (error) { 3108 return (error); 3109 } 3110 } 3111 } 3112 return (0); 3113 } 3114 3115 /* 3116 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 3117 * pagesize on each segment in its range, but if any fails with EINVAL, 3118 * then it reduces the pagesizes to the next size in the bitmap and 3119 * retries as_iset3_default_lpsize(). 
The reason why the code retries 3120 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 3121 * match the bigger sizes, and (b) it's hard to get this offset (to begin 3122 * with) to pass to map_pgszcvec(). 3123 */ 3124 static int 3125 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 3126 uint_t szcvec) 3127 { 3128 int error; 3129 int retry; 3130 3131 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3132 3133 for (;;) { 3134 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 3135 if (error == EINVAL && retry) { 3136 szcvec &= ~(1 << szc); 3137 if (szcvec <= 1) { 3138 return (EINVAL); 3139 } 3140 szc = highbit(szcvec) - 1; 3141 } else { 3142 return (error); 3143 } 3144 } 3145 } 3146 3147 /* 3148 * as_iset1_default_lpsize() breaks its chunk into areas where existing 3149 * segments have a smaller szc than we want to set. For each such area, 3150 * it calls as_iset2_default_lpsize() 3151 */ 3152 static int 3153 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3154 uint_t szcvec) 3155 { 3156 struct seg *seg; 3157 size_t ssize; 3158 caddr_t setaddr = raddr; 3159 size_t setsize = 0; 3160 int set; 3161 int error; 3162 3163 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3164 3165 seg = as_segat(as, raddr); 3166 if (seg == NULL) { 3167 panic("as_iset1_default_lpsize: no seg"); 3168 } 3169 if (seg->s_szc < szc) { 3170 set = 1; 3171 } else { 3172 set = 0; 3173 } 3174 3175 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3176 if (raddr >= seg->s_base + seg->s_size) { 3177 seg = AS_SEGNEXT(as, seg); 3178 if (seg == NULL || raddr != seg->s_base) { 3179 panic("as_iset1_default_lpsize: as changed"); 3180 } 3181 if (seg->s_szc >= szc && set) { 3182 ASSERT(setsize != 0); 3183 error = as_iset2_default_lpsize(as, 3184 setaddr, setsize, szc, szcvec); 3185 if (error) { 3186 return (error); 3187 } 3188 set = 0; 3189 } else if (seg->s_szc < szc && !set) { 3190 setaddr = raddr; 3191 setsize = 
0; 3192 set = 1; 3193 } 3194 } 3195 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3196 ssize = seg->s_base + seg->s_size - raddr; 3197 } else { 3198 ssize = rsize; 3199 } 3200 } 3201 error = 0; 3202 if (set) { 3203 ASSERT(setsize != 0); 3204 error = as_iset2_default_lpsize(as, setaddr, setsize, 3205 szc, szcvec); 3206 } 3207 return (error); 3208 } 3209 3210 /* 3211 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 3212 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 3213 * chunk to as_iset1_default_lpsize(). 3214 */ 3215 static int 3216 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 3217 int type) 3218 { 3219 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM; 3220 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 3221 flags, rtype, 1); 3222 uint_t szc; 3223 uint_t nszc; 3224 int error; 3225 caddr_t a; 3226 caddr_t eaddr; 3227 size_t segsize; 3228 size_t pgsz; 3229 uint_t save_szcvec; 3230 3231 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3232 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 3233 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 3234 3235 szcvec &= ~1; 3236 if (szcvec <= 1) { /* skip if base page size */ 3237 return (0); 3238 } 3239 3240 /* Get the pagesize of the first larger page size. 
*/ 3241 szc = lowbit(szcvec) - 1; 3242 pgsz = page_get_pagesize(szc); 3243 eaddr = addr + size; 3244 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3245 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3246 3247 save_szcvec = szcvec; 3248 szcvec >>= (szc + 1); 3249 nszc = szc; 3250 while (szcvec) { 3251 if ((szcvec & 0x1) == 0) { 3252 nszc++; 3253 szcvec >>= 1; 3254 continue; 3255 } 3256 nszc++; 3257 pgsz = page_get_pagesize(nszc); 3258 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3259 if (a != addr) { 3260 ASSERT(szc > 0); 3261 ASSERT(a < eaddr); 3262 segsize = a - addr; 3263 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3264 save_szcvec); 3265 if (error) { 3266 return (error); 3267 } 3268 addr = a; 3269 } 3270 szc = nszc; 3271 szcvec >>= 1; 3272 } 3273 3274 ASSERT(addr < eaddr); 3275 szcvec = save_szcvec; 3276 while (szcvec) { 3277 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3278 ASSERT(a >= addr); 3279 if (a != addr) { 3280 ASSERT(szc > 0); 3281 segsize = a - addr; 3282 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3283 save_szcvec); 3284 if (error) { 3285 return (error); 3286 } 3287 addr = a; 3288 } 3289 szcvec &= ~(1 << szc); 3290 if (szcvec) { 3291 szc = highbit(szcvec) - 1; 3292 pgsz = page_get_pagesize(szc); 3293 } 3294 } 3295 ASSERT(addr == eaddr); 3296 3297 return (0); 3298 } 3299 3300 /* 3301 * Set the default large page size for the range. Called via memcntl with 3302 * page size set to 0. as_set_default_lpsize breaks the range down into 3303 * chunks with the same type/flags, ignores-non segvn segments, and passes 3304 * each chunk to as_iset_default_lpsize(). 
3305 */ 3306 int 3307 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size) 3308 { 3309 struct seg *seg; 3310 caddr_t raddr; 3311 size_t rsize; 3312 size_t ssize; 3313 int rtype, rflags; 3314 int stype, sflags; 3315 int error; 3316 caddr_t setaddr; 3317 size_t setsize; 3318 int segvn; 3319 3320 if (size == 0) 3321 return (0); 3322 3323 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3324 again: 3325 error = 0; 3326 3327 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3328 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 3329 (size_t)raddr; 3330 3331 if (raddr + rsize < raddr) { /* check for wraparound */ 3332 AS_LOCK_EXIT(as, &as->a_lock); 3333 return (ENOMEM); 3334 } 3335 as_clearwatchprot(as, raddr, rsize); 3336 seg = as_segat(as, raddr); 3337 if (seg == NULL) { 3338 as_setwatch(as); 3339 AS_LOCK_EXIT(as, &as->a_lock); 3340 return (ENOMEM); 3341 } 3342 if (seg->s_ops == &segvn_ops) { 3343 rtype = segop_gettype(seg, addr); 3344 rflags = rtype & (MAP_TEXT | MAP_INITDATA); 3345 rtype = rtype & (MAP_SHARED | MAP_PRIVATE); 3346 segvn = 1; 3347 } else { 3348 segvn = 0; 3349 } 3350 setaddr = raddr; 3351 setsize = 0; 3352 3353 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3354 if (raddr >= (seg->s_base + seg->s_size)) { 3355 seg = AS_SEGNEXT(as, seg); 3356 if (seg == NULL || raddr != seg->s_base) { 3357 error = ENOMEM; 3358 break; 3359 } 3360 if (seg->s_ops == &segvn_ops) { 3361 stype = segop_gettype(seg, raddr); 3362 sflags = stype & (MAP_TEXT | MAP_INITDATA); 3363 stype &= (MAP_SHARED | MAP_PRIVATE); 3364 if (segvn && (rflags != sflags || 3365 rtype != stype)) { 3366 /* 3367 * The next segment is also segvn but 3368 * has different flags and/or type. 
3369 */ 3370 ASSERT(setsize != 0); 3371 error = as_iset_default_lpsize(as, 3372 setaddr, setsize, rflags, rtype); 3373 if (error) { 3374 break; 3375 } 3376 rflags = sflags; 3377 rtype = stype; 3378 setaddr = raddr; 3379 setsize = 0; 3380 } else if (!segvn) { 3381 rflags = sflags; 3382 rtype = stype; 3383 setaddr = raddr; 3384 setsize = 0; 3385 segvn = 1; 3386 } 3387 } else if (segvn) { 3388 /* The next segment is not segvn. */ 3389 ASSERT(setsize != 0); 3390 error = as_iset_default_lpsize(as, 3391 setaddr, setsize, rflags, rtype); 3392 if (error) { 3393 break; 3394 } 3395 segvn = 0; 3396 } 3397 } 3398 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3399 ssize = seg->s_base + seg->s_size - raddr; 3400 } else { 3401 ssize = rsize; 3402 } 3403 } 3404 if (error == 0 && segvn) { 3405 /* The last chunk when rsize == 0. */ 3406 ASSERT(setsize != 0); 3407 error = as_iset_default_lpsize(as, setaddr, setsize, 3408 rflags, rtype); 3409 } 3410 3411 if (error == IE_RETRY) { 3412 goto again; 3413 } else if (error == IE_NOMEM) { 3414 error = EAGAIN; 3415 } else if (error == ENOTSUP) { 3416 error = EINVAL; 3417 } else if (error == EAGAIN) { 3418 mutex_enter(&as->a_contents); 3419 if (!AS_ISNOUNMAPWAIT(as)) { 3420 if (AS_ISUNMAPWAIT(as) == 0) { 3421 cv_broadcast(&as->a_cv); 3422 } 3423 AS_SETUNMAPWAIT(as); 3424 AS_LOCK_EXIT(as, &as->a_lock); 3425 while (AS_ISUNMAPWAIT(as)) { 3426 cv_wait(&as->a_cv, &as->a_contents); 3427 } 3428 mutex_exit(&as->a_contents); 3429 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3430 } else { 3431 /* 3432 * We may have raced with 3433 * segvn_reclaim()/segspt_reclaim(). In this case 3434 * clean nounmapwait flag and retry since softlockcnt 3435 * in this segment may be already 0. We don't drop as 3436 * writer lock so our number of retries without 3437 * sleeping should be very small. See segvn_reclaim() 3438 * for more comments. 
3439 */ 3440 AS_CLRNOUNMAPWAIT(as); 3441 mutex_exit(&as->a_contents); 3442 } 3443 goto again; 3444 } 3445 3446 as_setwatch(as); 3447 AS_LOCK_EXIT(as, &as->a_lock); 3448 return (error); 3449 } 3450 3451 /* 3452 * Setup all of the uninitialized watched pages that we can. 3453 */ 3454 void 3455 as_setwatch(struct as *as) 3456 { 3457 struct watched_page *pwp; 3458 struct seg *seg; 3459 caddr_t vaddr; 3460 uint_t prot; 3461 int err, retrycnt; 3462 3463 if (avl_numnodes(&as->a_wpage) == 0) 3464 return; 3465 3466 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3467 3468 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3469 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3470 retrycnt = 0; 3471 retry: 3472 vaddr = pwp->wp_vaddr; 3473 if (pwp->wp_oprot != 0 || /* already set up */ 3474 (seg = as_segat(as, vaddr)) == NULL || 3475 segop_getprot(seg, vaddr, 0, &prot) != 0) 3476 continue; 3477 3478 pwp->wp_oprot = prot; 3479 if (pwp->wp_read) 3480 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3481 if (pwp->wp_write) 3482 prot &= ~PROT_WRITE; 3483 if (pwp->wp_exec) 3484 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3485 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3486 err = segop_setprot(seg, vaddr, PAGESIZE, prot); 3487 if (err == IE_RETRY) { 3488 pwp->wp_oprot = 0; 3489 ASSERT(retrycnt == 0); 3490 retrycnt++; 3491 goto retry; 3492 } 3493 } 3494 pwp->wp_prot = prot; 3495 } 3496 } 3497 3498 /* 3499 * Clear all of the watched pages in the address space. 
3500 */ 3501 void 3502 as_clearwatch(struct as *as) 3503 { 3504 struct watched_page *pwp; 3505 struct seg *seg; 3506 caddr_t vaddr; 3507 uint_t prot; 3508 int err, retrycnt; 3509 3510 if (avl_numnodes(&as->a_wpage) == 0) 3511 return; 3512 3513 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3514 3515 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3516 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3517 retrycnt = 0; 3518 retry: 3519 vaddr = pwp->wp_vaddr; 3520 if (pwp->wp_oprot == 0 || /* not set up */ 3521 (seg = as_segat(as, vaddr)) == NULL) 3522 continue; 3523 3524 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3525 err = segop_setprot(seg, vaddr, PAGESIZE, prot); 3526 if (err == IE_RETRY) { 3527 ASSERT(retrycnt == 0); 3528 retrycnt++; 3529 goto retry; 3530 } 3531 } 3532 pwp->wp_oprot = 0; 3533 pwp->wp_prot = 0; 3534 } 3535 } 3536 3537 /* 3538 * Force a new setup for all the watched pages in the range. 3539 */ 3540 static void 3541 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3542 { 3543 struct watched_page *pwp; 3544 struct watched_page tpw; 3545 caddr_t eaddr = addr + size; 3546 caddr_t vaddr; 3547 struct seg *seg; 3548 int err, retrycnt; 3549 uint_t wprot; 3550 avl_index_t where; 3551 3552 if (avl_numnodes(&as->a_wpage) == 0) 3553 return; 3554 3555 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3556 3557 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3558 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3559 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3560 3561 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3562 retrycnt = 0; 3563 vaddr = pwp->wp_vaddr; 3564 3565 wprot = prot; 3566 if (pwp->wp_read) 3567 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3568 if (pwp->wp_write) 3569 wprot &= ~PROT_WRITE; 3570 if (pwp->wp_exec) 3571 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3572 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3573 retry: 3574 seg = as_segat(as, vaddr); 3575 if (seg == NULL) { 3576 
panic("as_setwatchprot: no seg"); 3577 /*NOTREACHED*/ 3578 } 3579 err = segop_setprot(seg, vaddr, PAGESIZE, wprot); 3580 if (err == IE_RETRY) { 3581 ASSERT(retrycnt == 0); 3582 retrycnt++; 3583 goto retry; 3584 } 3585 } 3586 pwp->wp_oprot = prot; 3587 pwp->wp_prot = wprot; 3588 3589 pwp = AVL_NEXT(&as->a_wpage, pwp); 3590 } 3591 } 3592 3593 /* 3594 * Clear all of the watched pages in the range. 3595 */ 3596 static void 3597 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3598 { 3599 caddr_t eaddr = addr + size; 3600 struct watched_page *pwp; 3601 struct watched_page tpw; 3602 uint_t prot; 3603 struct seg *seg; 3604 int err, retrycnt; 3605 avl_index_t where; 3606 3607 if (avl_numnodes(&as->a_wpage) == 0) 3608 return; 3609 3610 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3611 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3612 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3613 3614 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3615 3616 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3617 3618 if ((prot = pwp->wp_oprot) != 0) { 3619 retrycnt = 0; 3620 3621 if (prot != pwp->wp_prot) { 3622 retry: 3623 seg = as_segat(as, pwp->wp_vaddr); 3624 if (seg == NULL) 3625 continue; 3626 err = segop_setprot(seg, pwp->wp_vaddr, 3627 PAGESIZE, prot); 3628 if (err == IE_RETRY) { 3629 ASSERT(retrycnt == 0); 3630 retrycnt++; 3631 goto retry; 3632 3633 } 3634 } 3635 pwp->wp_oprot = 0; 3636 pwp->wp_prot = 0; 3637 } 3638 3639 pwp = AVL_NEXT(&as->a_wpage, pwp); 3640 } 3641 } 3642 3643 void 3644 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3645 { 3646 struct proc *p; 3647 3648 mutex_enter(&pidlock); 3649 for (p = practive; p; p = p->p_next) { 3650 if (p->p_as == as) { 3651 mutex_enter(&p->p_lock); 3652 if (p->p_as == as) 3653 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3654 mutex_exit(&p->p_lock); 3655 } 3656 } 3657 mutex_exit(&pidlock); 3658 } 3659 3660 /* 3661 * return memory object ID 3662 */ 3663 int 3664 as_getmemid(struct as *as, caddr_t 
addr, memid_t *memidp) 3665 { 3666 struct seg *seg; 3667 int sts; 3668 3669 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 3670 seg = as_segat(as, addr); 3671 if (seg == NULL) { 3672 AS_LOCK_EXIT(as, &as->a_lock); 3673 return (EFAULT); 3674 } 3675 /* 3676 * catch old drivers which may not support getmemid 3677 */ 3678 if (seg->s_ops->getmemid == NULL) { 3679 AS_LOCK_EXIT(as, &as->a_lock); 3680 return (ENODEV); 3681 } 3682 3683 sts = segop_getmemid(seg, addr, memidp); 3684 3685 AS_LOCK_EXIT(as, &as->a_lock); 3686 return (sts); 3687 }