343 return (0);
344 }
345
346 /*
347 * Search for the segment containing addr. If a segment containing addr
348 * exists, that segment is returned. If no such segment exists, and
349 * the list spans addresses greater than addr, then the first segment
350 * whose base is greater than addr is returned; otherwise, NULL is
351 * returned unless tail is true, in which case the last element of the
352 * list is returned.
353 *
354 * a_seglast is used to cache the last found segment for repeated
355 * searches to the same addr (which happens frequently).
356 */
357 struct seg *
358 as_findseg(struct as *as, caddr_t addr, int tail)
359 {
360 struct seg *seg = as->a_seglast;
361 avl_index_t where;
362
363 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
364
365 if (seg != NULL &&
366 seg->s_base <= addr &&
367 addr < seg->s_base + seg->s_size)
368 return (seg);
369
370 seg = avl_find(&as->a_segtree, &addr, &where);
371 if (seg != NULL)
372 return (as->a_seglast = seg);
373
374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 if (seg == NULL && tail)
376 seg = avl_last(&as->a_segtree);
377 return (as->a_seglast = seg);
378 }
379
380 #ifdef VERIFY_SEGLIST
381 /*
382 * verify that the linked list is coherent
383 */
405 nsegs++;
406 }
407 ASSERT(seglast == NULL);
408 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409 }
410 #endif /* VERIFY_SEGLIST */
411
412 /*
413 * Add a new segment to the address space. The avl_find()
414 * may be expensive so we attempt to use last segment accessed
415 * in as_gap() as an insertion point.
416 */
417 int
418 as_addseg(struct as *as, struct seg *newseg)
419 {
420 struct seg *seg;
421 caddr_t addr;
422 caddr_t eaddr;
423 avl_index_t where;
424
425 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
426
427 as->a_updatedir = 1; /* inform /proc */
428 gethrestime(&as->a_updatetime);
429
430 if (as->a_lastgaphl != NULL) {
431 struct seg *hseg = NULL;
432 struct seg *lseg = NULL;
433
434 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 hseg = as->a_lastgaphl;
436 lseg = AVL_PREV(&as->a_segtree, hseg);
437 } else {
438 lseg = as->a_lastgaphl;
439 hseg = AVL_NEXT(&as->a_segtree, lseg);
440 }
441
442 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 hseg->s_base > newseg->s_base) {
444 avl_insert_here(&as->a_segtree, newseg, lseg,
445 AVL_AFTER);
487 }
488 #endif
489 return (-1); /* overlapping segment */
490 }
491 }
492 }
493 as->a_seglast = newseg;
494 avl_insert(&as->a_segtree, newseg, where);
495
496 #ifdef VERIFY_SEGLIST
497 as_verify(as);
498 #endif
499 return (0);
500 }
501
502 struct seg *
503 as_removeseg(struct as *as, struct seg *seg)
504 {
505 avl_tree_t *t;
506
507 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
508
509 as->a_updatedir = 1; /* inform /proc */
510 gethrestime(&as->a_updatetime);
511
512 if (seg == NULL)
513 return (NULL);
514
515 t = &as->a_segtree;
516 if (as->a_seglast == seg)
517 as->a_seglast = NULL;
518 as->a_lastgaphl = NULL;
519
520 /*
521 * if this segment is at an address higher than
522 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523 */
524 if (as->a_lastgap &&
525 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 as->a_lastgap = AVL_NEXT(t, seg);
527
528 /*
529 * remove the segment from the seg tree
530 */
531 avl_remove(t, seg);
532
533 #ifdef VERIFY_SEGLIST
534 as_verify(as);
535 #endif
536 return (seg);
537 }
538
539 /*
540 * Find a segment containing addr.
541 */
542 struct seg *
543 as_segat(struct as *as, caddr_t addr)
544 {
545 struct seg *seg = as->a_seglast;
546
547 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
548
549 if (seg != NULL && seg->s_base <= addr &&
550 addr < seg->s_base + seg->s_size)
551 return (seg);
552
553 seg = avl_find(&as->a_segtree, &addr, NULL);
554 return (seg);
555 }
556
557 /*
558 * Serialize all searches for holes in an address space to
559 * prevent two or more threads from allocating the same virtual
560 * address range. The address space must not be "read/write"
561 * locked by the caller since we may block.
562 */
563 void
564 as_rangelock(struct as *as)
565 {
566 mutex_enter(&as->a_contents);
567 while (AS_ISCLAIMGAP(as))
650 {
651 struct as *as;
652
653 as = kmem_cache_alloc(as_cache, KM_SLEEP);
654
655 as->a_flags = 0;
656 as->a_vbits = 0;
657 as->a_hrm = NULL;
658 as->a_seglast = NULL;
659 as->a_size = 0;
660 as->a_resvsize = 0;
661 as->a_updatedir = 0;
662 gethrestime(&as->a_updatetime);
663 as->a_objectdir = NULL;
664 as->a_sizedir = 0;
665 as->a_userlimit = (caddr_t)USERLIMIT;
666 as->a_lastgap = NULL;
667 as->a_lastgaphl = NULL;
668 as->a_callbacks = NULL;
669
670 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
671 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
672 AS_LOCK_EXIT(as, &as->a_lock);
673
674 as->a_xhat = NULL;
675
676 return (as);
677 }
678
679 /*
680 * Free an address space data structure.
681 * Need to free the hat first and then
682 * all the segments on this as and finally
683 * the space for the as struct itself.
684 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	int called = 0;	/* nonzero once hat/xhat teardown has been started */

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	/* This will prevent new XHATs from attaching to as */
	if (!called)
		AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	/*
	 * Begin hat (and any xhat) teardown exactly once, even though we
	 * may loop back to "top" after waiting for locked memory below.
	 */
	if (!called) {
		called = 1;
		hat_free_start(hat);
		if (as->a_xhat != NULL)
			xhat_free_start_all(as);
	}
	/* Unmap and free every segment in the address space. */
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		/* Capture the successor before seg is destroyed. */
		next = AS_SEGNEXT(as, seg);
retry:
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				/* Let the callbacks run; restart from "top". */
				AS_LOCK_EXIT(as, &as->a_lock);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	if (as->a_xhat != NULL)
		xhat_free_end_all(as);
	AS_LOCK_EXIT(as, &as->a_lock);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem. Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}
780
/*
 * Duplicate address space "as" into a new address space for the child
 * process "forkedproc" at fork() time.  Every segment except nofault
 * (S_PURGE) segments is replicated via SEGOP_DUP.  Returns 0 on
 * success, -1 on seg_alloc failure, or the error from SEGOP_DUP /
 * hat_dup.  On failure the partially built address space is freed.
 */
int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t purgesize = 0;	/* bytes of S_PURGE segs not duplicated */
	int error;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);

	/* This will prevent new XHATs from attaching */
	mutex_enter(&as->a_contents);
	AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	mutex_enter(&newas->a_contents);
	AS_SETBUSY(newas);
	mutex_exit(&newas->a_contents);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		/* Nofault (S_PURGE) segments are not copied to the child. */
		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			/* Unwind: drop locks, clear BUSY, free the child as. */
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			seg_free(newseg);
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_free(newas);
			return (error);
		}
		newas->a_size += seg->s_size;
	}
	/* The child's reserved size excludes the skipped S_PURGE bytes. */
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
	if (as->a_xhat != NULL)
		error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);

	mutex_enter(&newas->a_contents);
	AS_CLRBUSY(newas);
	mutex_exit(&newas->a_contents);
	AS_LOCK_EXIT(newas, &newas->a_lock);

	as_setwatch(as);
	mutex_enter(&as->a_contents);
	AS_CLRBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_EXIT(as, &as->a_lock);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}
866
867 /*
868 * Handle a ``fault'' at addr for size bytes.
869 */
870 faultcode_t
871 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
872 enum fault_type type, enum seg_rw rw)
873 {
874 struct seg *seg;
875 caddr_t raddr; /* rounded down addr */
876 size_t rsize; /* rounded up size */
877 size_t ssize;
878 faultcode_t res = 0;
942 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
943 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
944 (size_t)raddr;
945
946 /*
947 * XXX -- Don't grab the as lock for segkmap. We should grab it for
948 * correctness, but then we could be stuck holding this lock for
949 * a LONG time if the fault needs to be resolved on a slow
950 * filesystem, and then no-one will be able to exec new commands,
951 * as exec'ing requires the write lock on the as.
952 */
953 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
954 raddr + size < segkmap->s_base + segkmap->s_size) {
955 /*
956 * if (as==&kas), this can't be XHAT: we've already returned
957 * FC_NOSUPPORT.
958 */
959 seg = segkmap;
960 as_lock_held = 0;
961 } else {
962 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
963 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
964 /*
965 * Grab and hold the writers' lock on the as
966 * if the fault is to a watched page.
967 * This will keep CPUs from "peeking" at the
968 * address range while we're temporarily boosting
969 * the permissions for the XHAT device to
970 * resolve the fault in the segment layer.
971 *
972 * We could check whether faulted address
973 * is within a watched page and only then grab
974 * the writer lock, but this is simpler.
975 */
976 AS_LOCK_EXIT(as, &as->a_lock);
977 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
978 }
979
980 seg = as_segat(as, raddr);
981 if (seg == NULL) {
982 AS_LOCK_EXIT(as, &as->a_lock);
983 if ((lwp != NULL) && (!is_xhat))
984 lwp->lwp_nostop--;
985 return (FC_NOMAP);
986 }
987
988 as_lock_held = 1;
989 }
990
991 addrsav = raddr;
992 segsav = seg;
993
994 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
995 if (raddr >= seg->s_base + seg->s_size) {
996 seg = AS_SEGNEXT(as, seg);
997 if (seg == NULL || raddr != seg->s_base) {
998 res = FC_NOMAP;
999 break;
1000 }
1001 }
1002 if (raddr + rsize > seg->s_base + seg->s_size)
1043 */
1044 if (res != 0 && type == F_SOFTLOCK) {
1045 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
1046 if (addrsav >= seg->s_base + seg->s_size)
1047 seg = AS_SEGNEXT(as, seg);
1048 ASSERT(seg != NULL);
1049 /*
1050 * Now call the fault routine again to perform the
1051 * unlock using S_OTHER instead of the rw variable
1052 * since we never got a chance to touch the pages.
1053 */
1054 if (raddr > seg->s_base + seg->s_size)
1055 ssize = seg->s_base + seg->s_size - addrsav;
1056 else
1057 ssize = raddr - addrsav;
1058 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
1059 F_SOFTUNLOCK, S_OTHER);
1060 }
1061 }
1062 if (as_lock_held)
1063 AS_LOCK_EXIT(as, &as->a_lock);
1064 if ((lwp != NULL) && (!is_xhat))
1065 lwp->lwp_nostop--;
1066
1067 /*
1068 * If the lower levels returned EDEADLK for a fault,
1069 * It means that we should retry the fault. Let's wait
1070 * a bit also to let the deadlock causing condition clear.
1071 * This is part of a gross hack to work around a design flaw
1072 * in the ufs/sds logging code and should go away when the
1073 * logging code is re-designed to fix the problem. See bug
1074 * 4125102 for details of the problem.
1075 */
1076 if (FC_ERRNO(res) == EDEADLK) {
1077 delay(deadlk_wait);
1078 res = 0;
1079 goto retry;
1080 }
1081 return (res);
1082 }
1083
1091 {
1092 struct seg *seg;
1093 caddr_t raddr; /* rounded down addr */
1094 size_t rsize; /* rounded up size */
1095 faultcode_t res = 0;
1096 klwp_t *lwp = ttolwp(curthread);
1097
1098 retry:
1099 /*
1100 * Indicate that the lwp is not to be stopped while waiting
1101 * for a pagefault. This is to avoid deadlock while debugging
1102 * a process via /proc over NFS (in particular).
1103 */
1104 if (lwp != NULL)
1105 lwp->lwp_nostop++;
1106
1107 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1108 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1109 (size_t)raddr;
1110
1111 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1112 seg = as_segat(as, raddr);
1113 if (seg == NULL) {
1114 AS_LOCK_EXIT(as, &as->a_lock);
1115 if (lwp != NULL)
1116 lwp->lwp_nostop--;
1117 return (FC_NOMAP);
1118 }
1119
1120 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1121 if (raddr >= seg->s_base + seg->s_size) {
1122 seg = AS_SEGNEXT(as, seg);
1123 if (seg == NULL || raddr != seg->s_base) {
1124 res = FC_NOMAP;
1125 break;
1126 }
1127 }
1128 res = SEGOP_FAULTA(seg, raddr);
1129 if (res != 0)
1130 break;
1131 }
1132 AS_LOCK_EXIT(as, &as->a_lock);
1133 if (lwp != NULL)
1134 lwp->lwp_nostop--;
1135 /*
1136 * If the lower levels returned EDEADLK for a fault,
1137 * It means that we should retry the fault. Let's wait
1138 * a bit also to let the deadlock causing condition clear.
1139 * This is part of a gross hack to work around a design flaw
1140 * in the ufs/sds logging code and should go away when the
1141 * logging code is re-designed to fix the problem. See bug
1142 * 4125102 for details of the problem.
1143 */
1144 if (FC_ERRNO(res) == EDEADLK) {
1145 delay(deadlk_wait);
1146 res = 0;
1147 goto retry;
1148 }
1149 return (res);
1150 }
1151
1152 /*
1172 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1173 (size_t)raddr;
1174
1175 if (raddr + rsize < raddr) /* check for wraparound */
1176 return (ENOMEM);
1177
1178 saveraddr = raddr;
1179 saversize = rsize;
1180
1181 /*
1182 * Normally we only lock the as as a reader. But
1183 * if due to setprot the segment driver needs to split
1184 * a segment it will return IE_RETRY. Therefore we re-acquire
1185 * the as lock as a writer so the segment driver can change
1186 * the seg list. Also the segment driver will return IE_RETRY
 * after it has changed the segment list so we therefore keep
 * locking as a writer. Since these operations should be rare we
 * want to only lock as a writer when necessary.
1190 */
1191 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1192 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1193 } else {
1194 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1195 }
1196
1197 as_clearwatchprot(as, raddr, rsize);
1198 seg = as_segat(as, raddr);
1199 if (seg == NULL) {
1200 as_setwatch(as);
1201 AS_LOCK_EXIT(as, &as->a_lock);
1202 return (ENOMEM);
1203 }
1204
1205 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1206 if (raddr >= seg->s_base + seg->s_size) {
1207 seg = AS_SEGNEXT(as, seg);
1208 if (seg == NULL || raddr != seg->s_base) {
1209 error = ENOMEM;
1210 break;
1211 }
1212 }
1213 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1214 ssize = seg->s_base + seg->s_size - raddr;
1215 else
1216 ssize = rsize;
1217 retry:
1218 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1219
1220 if (error == IE_NOMEM) {
1221 error = EAGAIN;
1222 break;
1223 }
1224
1225 if (error == IE_RETRY) {
1226 AS_LOCK_EXIT(as, &as->a_lock);
1227 writer = 1;
1228 goto setprot_top;
1229 }
1230
1231 if (error == EAGAIN) {
1232 /*
1233 * Make sure we have a_lock as writer.
1234 */
1235 if (writer == 0) {
1236 AS_LOCK_EXIT(as, &as->a_lock);
1237 writer = 1;
1238 goto setprot_top;
1239 }
1240
1241 /*
1242 * Memory is currently locked. It must be unlocked
1243 * before this operation can succeed through a retry.
1244 * The possible reasons for locked memory and
1245 * corresponding strategies for unlocking are:
1246 * (1) Normal I/O
1247 * wait for a signal that the I/O operation
1248 * has completed and the memory is unlocked.
1249 * (2) Asynchronous I/O
1250 * The aio subsystem does not unlock pages when
1251 * the I/O is completed. Those pages are unlocked
1252 * when the application calls aiowait/aioerror.
1253 * So, to prevent blocking forever, cv_broadcast()
1254 * is done to wake up aio_cleanup_thread.
1255 * Subsequently, segvn_reclaim will be called, and
1256 * that will do AS_CLRUNMAPWAIT() and wake us up.
1257 * (3) Long term page locking:
1258 * Drivers intending to have pages locked for a
1259 * period considerably longer than for normal I/O
1260 * (essentially forever) may have registered for a
1261 * callback so they may unlock these pages on
1262 * request. This is needed to allow this operation
1263 * to succeed. Each entry on the callback list is
1264 * examined. If the event or address range pertains
1265 * the callback is invoked (unless it already is in
1266 * progress). The a_contents lock must be dropped
1267 * before the callback, so only one callback can
1268 * be done at a time. Go to the top and do more
1269 * until zero is returned. If zero is returned,
1270 * either there were no callbacks for this event
1271 * or they were already in progress.
1272 */
1273 mutex_enter(&as->a_contents);
1274 if (as->a_callbacks &&
1275 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1276 seg->s_base, seg->s_size))) {
1277 AS_LOCK_EXIT(as, &as->a_lock);
1278 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1279 } else if (!AS_ISNOUNMAPWAIT(as)) {
1280 if (AS_ISUNMAPWAIT(as) == 0)
1281 cv_broadcast(&as->a_cv);
1282 AS_SETUNMAPWAIT(as);
1283 AS_LOCK_EXIT(as, &as->a_lock);
1284 while (AS_ISUNMAPWAIT(as))
1285 cv_wait(&as->a_cv, &as->a_contents);
1286 } else {
1287 /*
1288 * We may have raced with
1289 * segvn_reclaim()/segspt_reclaim(). In this
1290 * case clean nounmapwait flag and retry since
1291 * softlockcnt in this segment may be already
1292 * 0. We don't drop as writer lock so our
1293 * number of retries without sleeping should
1294 * be very small. See segvn_reclaim() for
1295 * more comments.
1296 */
1297 AS_CLRNOUNMAPWAIT(as);
1298 mutex_exit(&as->a_contents);
1299 goto retry;
1300 }
1301 mutex_exit(&as->a_contents);
1302 goto setprot_top;
1303 } else if (error != 0)
1304 break;
1305 }
1306 if (error != 0) {
1307 as_setwatch(as);
1308 } else {
1309 as_setwatchprot(as, saveraddr, saversize, prot);
1310 }
1311 AS_LOCK_EXIT(as, &as->a_lock);
1312 return (error);
1313 }
1314
1315 /*
1316 * Check to make sure that the interval [addr, addr + size)
1317 * in address space `as' has at least the specified protection.
1318 * It is ok for the range to cross over several segments, as long
1319 * as they are contiguous.
1320 */
1321 int
1322 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1323 {
1324 struct seg *seg;
1325 size_t ssize;
1326 caddr_t raddr; /* rounded down addr */
1327 size_t rsize; /* rounded up size */
1328 int error = 0;
1329
1330 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1331 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1332 (size_t)raddr;
1333
1334 if (raddr + rsize < raddr) /* check for wraparound */
1335 return (ENOMEM);
1336
1337 /*
1338 * This is ugly as sin...
1339 * Normally, we only acquire the address space readers lock.
1340 * However, if the address space has watchpoints present,
1341 * we must acquire the writer lock on the address space for
1342 * the benefit of as_clearwatchprot() and as_setwatchprot().
1343 */
1344 if (avl_numnodes(&as->a_wpage) != 0)
1345 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1346 else
1347 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1348 as_clearwatchprot(as, raddr, rsize);
1349 seg = as_segat(as, raddr);
1350 if (seg == NULL) {
1351 as_setwatch(as);
1352 AS_LOCK_EXIT(as, &as->a_lock);
1353 return (ENOMEM);
1354 }
1355
1356 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1357 if (raddr >= seg->s_base + seg->s_size) {
1358 seg = AS_SEGNEXT(as, seg);
1359 if (seg == NULL || raddr != seg->s_base) {
1360 error = ENOMEM;
1361 break;
1362 }
1363 }
1364 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1365 ssize = seg->s_base + seg->s_size - raddr;
1366 else
1367 ssize = rsize;
1368
1369 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1370 if (error != 0)
1371 break;
1372 }
1373 as_setwatch(as);
1374 AS_LOCK_EXIT(as, &as->a_lock);
1375 return (error);
1376 }
1377
1378 int
1379 as_unmap(struct as *as, caddr_t addr, size_t size)
1380 {
1381 struct seg *seg, *seg_next;
1382 struct as_callback *cb;
1383 caddr_t raddr, eaddr;
1384 size_t ssize, rsize = 0;
1385 int err;
1386
1387 top:
1388 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1389 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1390 (uintptr_t)PAGEMASK);
1391
1392 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1393
1394 as->a_updatedir = 1; /* inform /proc */
1395 gethrestime(&as->a_updatetime);
1396
1397 /*
1398 * Use as_findseg to find the first segment in the range, then
1399 * step through the segments in order, following s_next.
1400 */
1401 as_clearwatchprot(as, raddr, eaddr - raddr);
1402
1403 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1404 if (eaddr <= seg->s_base)
1405 break; /* eaddr was in a gap; all done */
1406
1407 /* this is implied by the test above */
1408 ASSERT(raddr < eaddr);
1409
1410 if (raddr < seg->s_base)
1411 raddr = seg->s_base; /* raddr was in a gap */
1412
1453 * (3) Long term page locking:
1454 * Drivers intending to have pages locked for a
1455 * period considerably longer than for normal I/O
1456 * (essentially forever) may have registered for a
1457 * callback so they may unlock these pages on
1458 * request. This is needed to allow this operation
1459 * to succeed. Each entry on the callback list is
1460 * examined. If the event or address range pertains
1461 * the callback is invoked (unless it already is in
1462 * progress). The a_contents lock must be dropped
1463 * before the callback, so only one callback can
1464 * be done at a time. Go to the top and do more
1465 * until zero is returned. If zero is returned,
1466 * either there were no callbacks for this event
1467 * or they were already in progress.
1468 */
1469 mutex_enter(&as->a_contents);
1470 if (as->a_callbacks &&
1471 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1472 seg->s_base, seg->s_size))) {
1473 AS_LOCK_EXIT(as, &as->a_lock);
1474 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1475 } else if (!AS_ISNOUNMAPWAIT(as)) {
1476 if (AS_ISUNMAPWAIT(as) == 0)
1477 cv_broadcast(&as->a_cv);
1478 AS_SETUNMAPWAIT(as);
1479 AS_LOCK_EXIT(as, &as->a_lock);
1480 while (AS_ISUNMAPWAIT(as))
1481 cv_wait(&as->a_cv, &as->a_contents);
1482 } else {
1483 /*
1484 * We may have raced with
1485 * segvn_reclaim()/segspt_reclaim(). In this
1486 * case clean nounmapwait flag and retry since
1487 * softlockcnt in this segment may be already
1488 * 0. We don't drop as writer lock so our
1489 * number of retries without sleeping should
1490 * be very small. See segvn_reclaim() for
1491 * more comments.
1492 */
1493 AS_CLRNOUNMAPWAIT(as);
1494 mutex_exit(&as->a_contents);
1495 goto retry;
1496 }
1497 mutex_exit(&as->a_contents);
1498 goto top;
1499 } else if (err == IE_RETRY) {
1500 AS_LOCK_EXIT(as, &as->a_lock);
1501 goto top;
1502 } else if (err) {
1503 as_setwatch(as);
1504 AS_LOCK_EXIT(as, &as->a_lock);
1505 return (-1);
1506 }
1507
1508 as->a_size -= ssize;
1509 if (rsize)
1510 as->a_resvsize -= rsize;
1511 raddr += ssize;
1512 }
1513 AS_LOCK_EXIT(as, &as->a_lock);
1514 return (0);
1515 }
1516
1517 static int
1518 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1519 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1520 {
1521 uint_t szc;
1522 uint_t nszc;
1523 int error;
1524 caddr_t a;
1525 caddr_t eaddr;
1526 size_t segsize;
1527 struct seg *seg;
1528 size_t pgsz;
1529 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1530 uint_t save_szcvec;
1531
1532 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1533 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1534 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1535 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1536 if (!do_off) {
1537 vn_a->offset = 0;
1538 }
1539
1540 if (szcvec <= 1) {
1541 seg = seg_alloc(as, addr, size);
1542 if (seg == NULL) {
1543 return (ENOMEM);
1544 }
1545 vn_a->szc = 0;
1546 error = (*crfp)(seg, vn_a);
1547 if (error != 0) {
1548 seg_free(seg);
1549 } else {
1550 as->a_size += size;
1551 as->a_resvsize += size;
1552 }
1626 ASSERT(addr == eaddr);
1627
1628 return (0);
1629 }
1630
1631 static int
1632 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1633 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1634 {
1635 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1636 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1637 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1638 type, 0);
1639 int error;
1640 struct seg *seg;
1641 struct vattr va;
1642 u_offset_t eoff;
1643 size_t save_size = 0;
1644 extern size_t textrepl_size_thresh;
1645
1646 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1647 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649 ASSERT(vn_a->vp != NULL);
1650 ASSERT(vn_a->amp == NULL);
1651
1652 again:
1653 if (szcvec <= 1) {
1654 seg = seg_alloc(as, addr, size);
1655 if (seg == NULL) {
1656 return (ENOMEM);
1657 }
1658 vn_a->szc = 0;
1659 error = (*crfp)(seg, vn_a);
1660 if (error != 0) {
1661 seg_free(seg);
1662 } else {
1663 as->a_size += size;
1664 as->a_resvsize += size;
1665 }
1666 return (error);
1715 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1716 {
1717 uint_t szcvec;
1718 uchar_t type;
1719
1720 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1721 if (vn_a->type == MAP_SHARED) {
1722 type = MAPPGSZC_SHM;
1723 } else if (vn_a->type == MAP_PRIVATE) {
1724 if (vn_a->szc == AS_MAP_HEAP) {
1725 type = MAPPGSZC_HEAP;
1726 } else if (vn_a->szc == AS_MAP_STACK) {
1727 type = MAPPGSZC_STACK;
1728 } else {
1729 type = MAPPGSZC_PRIVM;
1730 }
1731 }
1732 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1733 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1734 (vn_a->flags & MAP_TEXT), type, 0);
1735 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
1736 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1737 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1738 ASSERT(vn_a->vp == NULL);
1739
1740 return (as_map_segvn_segs(as, addr, size, szcvec,
1741 crfp, vn_a, segcreated));
1742 }
1743
/*
 * Create a mapping in "as" at [addr, addr + size).  Acquires the
 * address space lock as writer; as_map_locked() drops the lock on
 * every return path, so no unlock is needed here.
 */
int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}
1750
/*
 * Create a mapping covering [addr, addr + size) using the segment
 * creation function "crfp" with creation arguments "argsp".  Entered
 * with the address space lock held as writer (see as_map()); the lock
 * is dropped on all return paths.  Returns 0 or an errno value.
 */
int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;			/* set if partial segs were created */
	struct proc *p = curproc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/* Enforce the per-process virtual memory resource control. */
	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as, &as->a_lock);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		/* Vnode-backed segvn mapping eligible for large pages. */
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				/* Undo segments created before the failure. */
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		/* Anonymous segvn mapping eligible for large pages. */
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				/* Undo segments created before the failure. */
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		/* Ordinary case: one segment created by crfp. */
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
		as->a_resvsize += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	return (error);
}
1846
1847
1848 /*
1849 * Delete all segments in the address space marked with S_PURGE.
1850 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1851 * These segments are deleted as a first step before calls to as_gap(), so
1852 * that they don't affect mmap() or shmat().
1853 */
1854 void
1855 as_purge(struct as *as)
1856 {
1857 struct seg *seg;
1858 struct seg *next_seg;
1859
1860 /*
1861 * the setting of NEEDSPURGE is protect by as_rangelock(), so
1862 * no need to grab a_contents mutex for this check
1863 */
1864 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1865 return;
1866
1867 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
1868 next_seg = NULL;
1869 seg = AS_SEGFIRST(as);
1870 while (seg != NULL) {
1871 next_seg = AS_SEGNEXT(as, seg);
1872 if (seg->s_flags & S_PURGE)
1873 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1874 seg = next_seg;
1875 }
1876 AS_LOCK_EXIT(as, &as->a_lock);
1877
1878 mutex_enter(&as->a_contents);
1879 as->a_flags &= ~AS_NEEDSPURGE;
1880 mutex_exit(&as->a_contents);
1881 }
1882
1883 /*
1884 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1885 * range of addresses at least "minlen" long, where the base of the range is
1886 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range. Thus,
1888 * if align was 4M and off was 16k, the user wants a hole which will start
1889 * 16k into a 4M page.
1890 *
1891 * If flags specifies AH_HI, the hole will have the highest possible address
1892 * in the range. We use the as->a_lastgap field to figure out where to
1893 * start looking for a gap.
1894 *
1895 * Otherwise, the gap will have the lowest possible address.
1896 *
1919 save_base = *basep;
1920 save_len = *lenp;
1921 save_minlen = minlen;
1922 save_redzone = redzone;
1923
1924 /*
1925 * For the first pass/fast_path, just add align and redzone into
1926 * minlen since if we get an allocation, we can guarantee that it
1927 * will fit the alignment and redzone requested.
1928 * This increases the chance that hibound will be adjusted to
1929 * a_lastgap->s_base which will likely allow us to find an
1930 * acceptable hole in the address space quicker.
1931 * If we can't find a hole with this fast_path, then we look for
1932 * smaller holes in which the alignment and offset may allow
1933 * the allocation to fit.
1934 */
1935 minlen += align;
1936 minlen += 2 * redzone;
1937 redzone = 0;
1938
1939 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1940 if (AS_SEGFIRST(as) == NULL) {
1941 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1942 align, redzone, off)) {
1943 AS_LOCK_EXIT(as, &as->a_lock);
1944 return (0);
1945 } else {
1946 AS_LOCK_EXIT(as, &as->a_lock);
1947 *basep = save_base;
1948 *lenp = save_len;
1949 return (-1);
1950 }
1951 }
1952
1953 retry:
1954 /*
1955 * Set up to iterate over all the inter-segment holes in the given
1956 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1957 * NULL for the highest-addressed hole. If moving backwards, we reset
1958 * sseg to denote the highest-addressed segment.
1959 */
1960 forward = (flags & AH_DIR) == AH_LO;
1961 if (forward) {
1962 hseg = as_findseg(as, lobound, 1);
1963 lseg = AS_SEGPREV(as, hseg);
1964 } else {
1965
1966 /*
2007 lo = lobound;
2008 if (hi > hibound)
2009 hi = hibound;
2010 /*
2011 * Verify that the candidate hole is big enough and meets
2012 * hardware constraints. If the hole is too small, no need
2013 * to do the further checks since they will fail.
2014 */
2015 *basep = lo;
2016 *lenp = hi - lo;
2017 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
2018 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
2019 ((flags & AH_CONTAIN) == 0 ||
2020 (*basep <= addr && *basep + *lenp > addr))) {
2021 if (!forward)
2022 as->a_lastgap = hseg;
2023 if (hseg != NULL)
2024 as->a_lastgaphl = hseg;
2025 else
2026 as->a_lastgaphl = lseg;
2027 AS_LOCK_EXIT(as, &as->a_lock);
2028 return (0);
2029 }
2030 cont:
2031 /*
2032 * Move to the next hole.
2033 */
2034 if (forward) {
2035 lseg = hseg;
2036 if (lseg == NULL)
2037 break;
2038 hseg = AS_SEGNEXT(as, hseg);
2039 } else {
2040 hseg = lseg;
2041 if (hseg == NULL)
2042 break;
2043 lseg = AS_SEGPREV(as, lseg);
2044 }
2045 }
2046 if (fast_path && (align != 0 || save_redzone != 0)) {
2047 fast_path = 0;
2048 minlen = save_minlen;
2049 redzone = save_redzone;
2050 goto retry;
2051 }
2052 *basep = save_base;
2053 *lenp = save_len;
2054 AS_LOCK_EXIT(as, &as->a_lock);
2055 return (-1);
2056 }
2057
2058 /*
2059 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2060 *
2061 * If flags specifies AH_HI, the hole will have the highest possible address
2062 * in the range. We use the as->a_lastgap field to figure out where to
2063 * start looking for a gap.
2064 *
2065 * Otherwise, the gap will have the lowest possible address.
2066 *
2067 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2068 *
2069 * If an adequate hole is found, base and len are set to reflect the part of
2070 * the hole that is within range, and 0 is returned, otherwise,
2071 * -1 is returned.
2072 *
2073 * NOTE: This routine is not correct when base+len overflows caddr_t.
2074 */
2076 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2077 caddr_t addr)
2078 {
2079
2080 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2081 }
2082
2083 /*
2084 * Return the next range within [base, base + len) that is backed
2085 * with "real memory". Skip holes and non-seg_vn segments.
2086 * We're lazy and only return one segment at a time.
2087 */
2088 int
2089 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2090 {
2091 extern struct seg_ops segspt_shmops; /* needs a header file */
2092 struct seg *seg;
2093 caddr_t addr, eaddr;
2094 caddr_t segend;
2095
2096 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2097
2098 addr = *basep;
2099 eaddr = addr + *lenp;
2100
2101 seg = as_findseg(as, addr, 0);
2102 if (seg != NULL)
2103 addr = MAX(seg->s_base, addr);
2104
2105 for (;;) {
2106 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2107 AS_LOCK_EXIT(as, &as->a_lock);
2108 return (EINVAL);
2109 }
2110
2111 if (seg->s_ops == &segvn_ops) {
2112 segend = seg->s_base + seg->s_size;
2113 break;
2114 }
2115
2116 /*
2117 * We do ISM by looking into the private data
2118 * to determine the real size of the segment.
2119 */
2120 if (seg->s_ops == &segspt_shmops) {
2121 segend = seg->s_base + spt_realsize(seg);
2122 if (addr < segend)
2123 break;
2124 }
2125
2126 seg = AS_SEGNEXT(as, seg);
2127
2128 if (seg != NULL)
2129 addr = seg->s_base;
2130 }
2131
2132 *basep = addr;
2133
2134 if (segend > eaddr)
2135 *lenp = eaddr - addr;
2136 else
2137 *lenp = segend - addr;
2138
2139 AS_LOCK_EXIT(as, &as->a_lock);
2140 return (0);
2141 }
2142
2143 /*
2144 * Swap the pages associated with the address space as out to
2145 * secondary storage, returning the number of bytes actually
2146 * swapped.
2147 *
2148 * The value returned is intended to correlate well with the process's
2149 * memory requirements. Its usefulness for this purpose depends on
2150 * how well the segment-level routines do at returning accurate
2151 * information.
2152 */
2153 size_t
2154 as_swapout(struct as *as)
2155 {
2156 struct seg *seg;
2157 size_t swpcnt = 0;
2158
2159 /*
2160 * Kernel-only processes have given up their address
2161 * spaces. Of course, we shouldn't be attempting to
2162 * swap out such processes in the first place...
2163 */
2164 if (as == NULL)
2165 return (0);
2166
2167 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2168
2169 /* Prevent XHATs from attaching */
2170 mutex_enter(&as->a_contents);
2171 AS_SETBUSY(as);
2172 mutex_exit(&as->a_contents);
2173
2174
2175 /*
2176 * Free all mapping resources associated with the address
2177 * space. The segment-level swapout routines capitalize
2178 * on this unmapping by scavanging pages that have become
2179 * unmapped here.
2180 */
2181 hat_swapout(as->a_hat);
2182 if (as->a_xhat != NULL)
2183 xhat_swapout_all(as);
2184
2185 mutex_enter(&as->a_contents);
2186 AS_CLRBUSY(as);
2187 mutex_exit(&as->a_contents);
2188
2189 /*
2190 * Call the swapout routines of all segments in the address
2191 * space to do the actual work, accumulating the amount of
2192 * space reclaimed.
2193 */
2194 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2195 struct seg_ops *ov = seg->s_ops;
2196
2197 /*
2198 * We have to check to see if the seg has
2199 * an ops vector because the seg may have
2200 * been in the middle of being set up when
2201 * the process was picked for swapout.
2202 */
2203 if ((ov != NULL) && (ov->swapout != NULL))
2204 swpcnt += SEGOP_SWAPOUT(seg);
2205 }
2206 AS_LOCK_EXIT(as, &as->a_lock);
2207 return (swpcnt);
2208 }
2209
2210 /*
2211 * Determine whether data from the mappings in interval [addr, addr + size)
2212 * are in the primary memory (core) cache.
2213 */
2214 int
2215 as_incore(struct as *as, caddr_t addr,
2216 size_t size, char *vec, size_t *sizep)
2217 {
2218 struct seg *seg;
2219 size_t ssize;
2220 caddr_t raddr; /* rounded down addr */
2221 size_t rsize; /* rounded up size */
2222 size_t isize; /* iteration size */
2223 int error = 0; /* result, assume success */
2224
2225 *sizep = 0;
2226 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2227 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2228 (size_t)raddr;
2229
2230 if (raddr + rsize < raddr) /* check for wraparound */
2231 return (ENOMEM);
2232
2233 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2234 seg = as_segat(as, raddr);
2235 if (seg == NULL) {
2236 AS_LOCK_EXIT(as, &as->a_lock);
2237 return (-1);
2238 }
2239
2240 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2241 if (raddr >= seg->s_base + seg->s_size) {
2242 seg = AS_SEGNEXT(as, seg);
2243 if (seg == NULL || raddr != seg->s_base) {
2244 error = -1;
2245 break;
2246 }
2247 }
2248 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2249 ssize = seg->s_base + seg->s_size - raddr;
2250 else
2251 ssize = rsize;
2252 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2253 if (isize != ssize) {
2254 error = -1;
2255 break;
2256 }
2257 vec += btopr(ssize);
2258 }
2259 AS_LOCK_EXIT(as, &as->a_lock);
2260 return (error);
2261 }
2262
2263 static void
2264 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2265 ulong_t *bitmap, size_t position, size_t npages)
2266 {
2267 caddr_t range_start;
2268 size_t pos1 = position;
2269 size_t pos2;
2270 size_t size;
2271 size_t end_pos = npages + position;
2272
2273 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2274 size = ptob((pos2 - pos1));
2275 range_start = (caddr_t)((uintptr_t)addr +
2276 ptob(pos1 - position));
2277
2278 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2279 (ulong_t *)NULL, (size_t)NULL);
2309 * address space "as".
2310 */
2311 /*ARGSUSED*/
int
as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
    uintptr_t arg, ulong_t *lock_map, size_t pos)
{
	struct seg *seg;	/* working segment */
	caddr_t raddr;		/* rounded down addr */
	caddr_t initraddr;	/* saved initial rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t initrsize;	/* saved initial rounded up size */
	size_t ssize;		/* size of seg */
	int error = 0;		/* result */
	size_t mlock_size;	/* size of bitmap */
	ulong_t *mlock_map;	/* pointer to bitmap used */
				/* to represent the locked */
				/* pages. */
	/*
	 * We come back to "retry" when a segment driver asks for IE_RETRY
	 * (see MC_ADVISE below) or after waiting out an EDEADLK at
	 * "lockerr"; only the retry case needs the writer lock.
	 */
retry:
	if (error == IE_RETRY)
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	else
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);

	/*
	 * If these are address space lock/unlock operations, loop over
	 * all segments in the address space, as appropriate.
	 */
	if (func == MC_LOCKAS) {
		size_t npages, idx;
		size_t rlen = 0;	/* rounded as length */

		idx = pos;

		if (arg & MCL_FUTURE) {
			mutex_enter(&as->a_contents);
			AS_SETPGLCK(as);
			mutex_exit(&as->a_contents);
		}
		if ((arg & MCL_CURRENT) == 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		}

		seg = AS_SEGFIRST(as);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		}

		/*
		 * Total the page-rounded lengths of all segments to size
		 * the bitmap that tracks which pages we manage to lock.
		 */
		do {
			raddr = (caddr_t)((uintptr_t)seg->s_base &
			    (uintptr_t)PAGEMASK);
			rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
			    PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
		} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

		mlock_size = BT_BITOUL(btopr(rlen));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EAGAIN);
		}

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = SEGOP_LOCKOP(seg, seg->s_base,
			    seg->s_size, attr, MC_LOCK, mlock_map, pos);
			if (error != 0)
				break;
			pos += seg_pages(seg);
		}

		/*
		 * Lock failed part-way through: use the bitmap to roll
		 * back exactly the pages that were locked so far.
		 */
		if (error) {
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg)) {

				raddr = (caddr_t)((uintptr_t)seg->s_base &
				    (uintptr_t)PAGEMASK);
				npages = seg_pages(seg);
				as_segunlock(seg, raddr, attr, mlock_map,
				    idx, npages);
				idx += npages;
			}
		}

		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
		AS_LOCK_EXIT(as, &as->a_lock);
		goto lockerr;
	} else if (func == MC_UNLOCKAS) {
		mutex_enter(&as->a_contents);
		AS_CLRPGLCK(as);
		mutex_exit(&as->a_contents);

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = SEGOP_LOCKOP(seg, seg->s_base,
			    seg->s_size, attr, MC_UNLOCK, NULL, 0);
			if (error != 0)
				break;
		}

		AS_LOCK_EXIT(as, &as->a_lock);
		goto lockerr;
	}

	/*
	 * Normalize addresses and sizes.
	 */
	initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	/*
	 * Get initial segment.
	 */
	if ((seg = as_segat(as, raddr)) == NULL) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	if (func == MC_LOCK) {
		mlock_size = BT_BITOUL(btopr(rsize));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EAGAIN);
		}
	}

	/*
	 * Loop over all segments. If a hole in the address range is
	 * discovered, then fail. For each segment, perform the appropriate
	 * control operation.
	 */
	while (rsize != 0) {

		/*
		 * Make sure there's no hole, calculate the portion
		 * of the next segment to be operated over.
		 */
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				if (func == MC_LOCK) {
					as_unlockerr(as, attr, mlock_map,
					    initraddr, initrsize - rsize);
					kmem_free(mlock_map,
					    mlock_size * sizeof (ulong_t));
				}
				AS_LOCK_EXIT(as, &as->a_lock);
				return (ENOMEM);
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		/*
		 * Dispatch on specific function.
		 */
		switch (func) {

		/*
		 * Synchronize cached data from mappings with backing
		 * objects.
		 */
		case MC_SYNC:
			if (error = SEGOP_SYNC(seg, raddr, ssize,
			    attr, (uint_t)arg)) {
				AS_LOCK_EXIT(as, &as->a_lock);
				return (error);
			}
			break;

		/*
		 * Lock pages in memory.
		 */
		case MC_LOCK:
			if (error = SEGOP_LOCKOP(seg, raddr, ssize,
			    attr, func, mlock_map, pos)) {
				/*
				 * Undo whatever was locked before the
				 * failure, then free the bitmap.
				 */
				as_unlockerr(as, attr, mlock_map, initraddr,
				    initrsize - rsize + ssize);
				kmem_free(mlock_map, mlock_size *
				    sizeof (ulong_t));
				AS_LOCK_EXIT(as, &as->a_lock);
				goto lockerr;
			}
			break;

		/*
		 * Unlock mapped pages.
		 */
		case MC_UNLOCK:
			(void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
			    (ulong_t *)NULL, (size_t)NULL);
			break;

		/*
		 * Store VM advise for mapped pages in segment layer.
		 */
		case MC_ADVISE:
			error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);

			/*
			 * Check for regular errors and special retry error
			 */
			if (error) {
				if (error == IE_RETRY) {
					/*
					 * Need to acquire writers lock, so
					 * have to drop readers lock and start
					 * all over again
					 */
					AS_LOCK_EXIT(as, &as->a_lock);
					goto retry;
				} else if (error == IE_REATTACH) {
					/*
					 * Find segment for current address
					 * because current segment just got
					 * split or concatenated
					 */
					seg = as_segat(as, raddr);
					if (seg == NULL) {
						AS_LOCK_EXIT(as, &as->a_lock);
						return (ENOMEM);
					}
				} else {
					/*
					 * Regular error
					 */
					AS_LOCK_EXIT(as, &as->a_lock);
					return (error);
				}
			}
			break;

		case MC_INHERIT_ZERO:
			if (seg->s_ops->inherit == NULL) {
				error = ENOTSUP;
			} else {
				error = SEGOP_INHERIT(seg, raddr, ssize,
				    SEGP_INH_ZERO);
			}
			if (error != 0) {
				AS_LOCK_EXIT(as, &as->a_lock);
				return (error);
			}
			break;

		/*
		 * Can't happen.
		 */
		default:
			panic("as_ctl: bad operation %d", func);
			/*NOTREACHED*/
		}

		rsize -= ssize;
		raddr += ssize;
	}

	if (func == MC_LOCK)
		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
	AS_LOCK_EXIT(as, &as->a_lock);
	return (0);
lockerr:

	/*
	 * If the lower levels returned EDEADLK for a segment lockop,
	 * it means that we should retry the operation. Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (error == EDEADLK) {
		delay(deadlk_wait);
		error = 0;
		goto retry;
	}
	return (error);
}
2597
2622 */
2623 static int
2624 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2625 caddr_t addr, size_t size, enum seg_rw rw)
2626 {
2627 caddr_t sv_addr = addr;
2628 size_t sv_size = size;
2629 struct seg *sv_seg = seg;
2630 ulong_t segcnt = 1;
2631 ulong_t cnt;
2632 size_t ssize;
2633 pgcnt_t npages = btop(size);
2634 page_t **plist;
2635 page_t **pl;
2636 int error;
2637 caddr_t eaddr;
2638 faultcode_t fault_err = 0;
2639 pgcnt_t pl_off;
2640 extern struct seg_ops segspt_shmops;
2641
2642 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2643 ASSERT(seg != NULL);
2644 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2645 ASSERT(addr + size > seg->s_base + seg->s_size);
2646 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2647 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2648
2649 /*
2650 * Count the number of segments covered by the range we are about to
2651 * lock. The segment count is used to size the shadow list we return
2652 * back to the caller.
2653 */
2654 for (; size != 0; size -= ssize, addr += ssize) {
2655 if (addr >= seg->s_base + seg->s_size) {
2656
2657 seg = AS_SEGNEXT(as, seg);
2658 if (seg == NULL || addr != seg->s_base) {
2659 AS_LOCK_EXIT(as, &as->a_lock);
2660 return (EFAULT);
2661 }
2662 /*
2663 * Do a quick check if subsequent segments
2664 * will most likely support pagelock.
2665 */
2666 if (seg->s_ops == &segvn_ops) {
2667 vnode_t *vp;
2668
2669 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2670 vp != NULL) {
2671 AS_LOCK_EXIT(as, &as->a_lock);
2672 goto slow;
2673 }
2674 } else if (seg->s_ops != &segspt_shmops) {
2675 AS_LOCK_EXIT(as, &as->a_lock);
2676 goto slow;
2677 }
2678 segcnt++;
2679 }
2680 if (addr + size > seg->s_base + seg->s_size) {
2681 ssize = seg->s_base + seg->s_size - addr;
2682 } else {
2683 ssize = size;
2684 }
2685 }
2686 ASSERT(segcnt > 1);
2687
2688 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2689
2690 addr = sv_addr;
2691 size = sv_size;
2692 seg = sv_seg;
2693
2694 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2695 if (addr >= seg->s_base + seg->s_size) {
2700 }
2701 if (addr + size > seg->s_base + seg->s_size) {
2702 ssize = seg->s_base + seg->s_size - addr;
2703 } else {
2704 ssize = size;
2705 }
2706 pl = &plist[npages + cnt];
2707 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2708 L_PAGELOCK, rw);
2709 if (error) {
2710 break;
2711 }
2712 ASSERT(plist[npages + cnt] != NULL);
2713 ASSERT(pl_off + btop(ssize) <= npages);
2714 bcopy(plist[npages + cnt], &plist[pl_off],
2715 btop(ssize) * sizeof (page_t *));
2716 pl_off += btop(ssize);
2717 }
2718
2719 if (size == 0) {
2720 AS_LOCK_EXIT(as, &as->a_lock);
2721 ASSERT(cnt == segcnt - 1);
2722 *ppp = plist;
2723 return (0);
2724 }
2725
2726 /*
2727 * one of pagelock calls failed. The error type is in error variable.
2728 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2729 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2730 * back to the caller.
2731 */
2732
2733 eaddr = addr;
2734 seg = sv_seg;
2735
2736 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2737 if (addr >= seg->s_base + seg->s_size) {
2738 seg = AS_SEGNEXT(as, seg);
2739 ASSERT(seg != NULL && addr == seg->s_base);
2740 cnt++;
2741 ASSERT(cnt < segcnt);
2742 }
2743 if (eaddr > seg->s_base + seg->s_size) {
2744 ssize = seg->s_base + seg->s_size - addr;
2745 } else {
2746 ssize = eaddr - addr;
2747 }
2748 pl = &plist[npages + cnt];
2749 ASSERT(*pl != NULL);
2750 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2751 L_PAGEUNLOCK, rw);
2752 }
2753
2754 AS_LOCK_EXIT(as, &as->a_lock);
2755
2756 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2757
2758 if (error != ENOTSUP && error != EFAULT) {
2759 return (error);
2760 }
2761
2762 slow:
2763 /*
2764 * If we are here because pagelock failed due to the need to cow fault
2765 * in the pages we want to lock F_SOFTLOCK will do this job and in
2766 * next as_pagelock() call for this address range pagelock will
2767 * hopefully succeed.
2768 */
2769 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2770 if (fault_err != 0) {
2771 return (fc_decode(fault_err));
2772 }
2773 *ppp = NULL;
2774
2783 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2784 size_t size, enum seg_rw rw)
2785 {
2786 size_t rsize;
2787 caddr_t raddr;
2788 faultcode_t fault_err;
2789 struct seg *seg;
2790 int err;
2791
2792 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2793 "as_pagelock_start: addr %p size %ld", addr, size);
2794
2795 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2796 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2797 (size_t)raddr;
2798
2799 /*
2800 * if the request crosses two segments let
2801 * as_fault handle it.
2802 */
2803 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2804
2805 seg = as_segat(as, raddr);
2806 if (seg == NULL) {
2807 AS_LOCK_EXIT(as, &as->a_lock);
2808 return (EFAULT);
2809 }
2810 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2811 if (raddr + rsize > seg->s_base + seg->s_size) {
2812 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2813 }
2814 if (raddr + rsize <= raddr) {
2815 AS_LOCK_EXIT(as, &as->a_lock);
2816 return (EFAULT);
2817 }
2818
2819 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2820 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2821
2822 /*
2823 * try to lock pages and pass back shadow list
2824 */
2825 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2826
2827 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2828
2829 AS_LOCK_EXIT(as, &as->a_lock);
2830
2831 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2832 return (err);
2833 }
2834
2835 /*
2836 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2837 * to no pagelock support for this segment or pages need to be cow
2838 * faulted in. If fault is needed F_SOFTLOCK will do this job for
2839 * this as_pagelock() call and in the next as_pagelock() call for the
 * same address range pagelock call will hopefully succeed.
2841 */
2842 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2843 if (fault_err != 0) {
2844 return (fc_decode(fault_err));
2845 }
2846 *ppp = NULL;
2847
2848 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2849 return (0);
2850 }
2851
2852 /*
2853 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2854 * lists from the end of plist and call pageunlock interface for each segment.
2855 * Drop as lock and free plist.
2856 */
2857 static void
2858 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2859 struct page **plist, enum seg_rw rw)
2860 {
2861 ulong_t cnt;
2862 caddr_t eaddr = addr + size;
2863 pgcnt_t npages = btop(size);
2864 size_t ssize;
2865 page_t **pl;
2866
2867 ASSERT(AS_LOCK_HELD(as, &as->a_lock));
2868 ASSERT(seg != NULL);
2869 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2870 ASSERT(addr + size > seg->s_base + seg->s_size);
2871 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2872 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2873 ASSERT(plist != NULL);
2874
2875 for (cnt = 0; addr < eaddr; addr += ssize) {
2876 if (addr >= seg->s_base + seg->s_size) {
2877 seg = AS_SEGNEXT(as, seg);
2878 ASSERT(seg != NULL && addr == seg->s_base);
2879 cnt++;
2880 }
2881 if (eaddr > seg->s_base + seg->s_size) {
2882 ssize = seg->s_base + seg->s_size - addr;
2883 } else {
2884 ssize = eaddr - addr;
2885 }
2886 pl = &plist[npages + cnt];
2887 ASSERT(*pl != NULL);
2888 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2889 L_PAGEUNLOCK, rw);
2890 }
2891 ASSERT(cnt > 0);
2892 AS_LOCK_EXIT(as, &as->a_lock);
2893
2894 cnt++;
2895 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2896 }
2897
2898 /*
2899 * unlock pages in a given address range
2900 */
2901 void
2902 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2903 enum seg_rw rw)
2904 {
2905 struct seg *seg;
2906 size_t rsize;
2907 caddr_t raddr;
2908
2909 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2910 "as_pageunlock_start: addr %p size %ld", addr, size);
2911
2912 /*
2913 * if the shadow list is NULL, as_pagelock was
2914 * falling back to as_fault
2915 */
2916 if (pp == NULL) {
2917 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2918 return;
2919 }
2920
2921 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2922 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2923 (size_t)raddr;
2924
2925 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2926 seg = as_segat(as, raddr);
2927 ASSERT(seg != NULL);
2928
2929 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2930 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2931
2932 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2933 if (raddr + rsize <= seg->s_base + seg->s_size) {
2934 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2935 } else {
2936 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2937 return;
2938 }
2939 AS_LOCK_EXIT(as, &as->a_lock);
2940 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2941 }
2942
/*
 * Set the preferred page size (szc) on the range [addr, addr + size),
 * which must be aligned to the page size pgsz implied by szc.  Walks
 * every segment overlapping the range and invokes SEGOP_SETPAGESIZE(),
 * honoring the segment drivers' retry protocol (IE_RETRY restarts from
 * the top, IE_NOMEM maps to EAGAIN, ENOTSUP maps to EINVAL).  When
 * "wait" is set and memory is locked (EAGAIN), blocks until the locks
 * are released and then retries.  Returns 0 or an errno value.
 */
int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;
	size_t pgsz = page_get_pagesize(szc);

setpgsz_top:
	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
		return (EINVAL);
	}

	raddr = addr;
	rsize = size;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			/* The range must be covered without holes. */
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

retry:
		error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			goto setpgsz_top;
		}

		if (error == ENOTSUP) {
			error = EINVAL;
			break;
		}

		if (wait && (error == EAGAIN)) {
			/*
			 * Memory is currently locked. It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	This is not relevant for as_setpagesize()
			 *	because we cannot change the page size for
			 *	driver memory. The attempt to do so will
			 *	fail with a different error than EAGAIN so
			 *	there's no need to trigger as callbacks like
			 *	as_unmap, as_setprot or as_free would do.
			 */
			mutex_enter(&as->a_contents);
			if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0) {
					cv_broadcast(&as->a_cv);
				}
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as)) {
					cv_wait(&as->a_cv, &as->a_contents);
				}
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setpgsz_top;
		} else if (error != 0) {
			break;
		}
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}
3066
3067 /*
3068 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3069 * in its chunk where s_szc is less than the szc we want to set.
3070 */
3071 static int
3072 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3073 int *retry)
3074 {
3075 struct seg *seg;
3076 size_t ssize;
3077 int error;
3078
3079 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3080
3081 seg = as_segat(as, raddr);
3082 if (seg == NULL) {
3083 panic("as_iset3_default_lpsize: no seg");
3084 }
3085
3086 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3087 if (raddr >= seg->s_base + seg->s_size) {
3088 seg = AS_SEGNEXT(as, seg);
3089 if (seg == NULL || raddr != seg->s_base) {
3090 panic("as_iset3_default_lpsize: as changed");
3091 }
3092 }
3093 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3094 ssize = seg->s_base + seg->s_size - raddr;
3095 } else {
3096 ssize = rsize;
3097 }
3098
3099 if (szc > seg->s_szc) {
3116 }
3117 return (0);
3118 }
3119
3120 /*
3121 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3122 * pagesize on each segment in its range, but if any fails with EINVAL,
3123 * then it reduces the pagesizes to the next size in the bitmap and
3124 * retries as_iset3_default_lpsize(). The reason why the code retries
3125 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3126 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3127 * with) to pass to map_pgszcvec().
3128 */
3129 static int
3130 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3131 uint_t szcvec)
3132 {
3133 int error;
3134 int retry;
3135
3136 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3137
3138 for (;;) {
3139 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3140 if (error == EINVAL && retry) {
3141 szcvec &= ~(1 << szc);
3142 if (szcvec <= 1) {
3143 return (EINVAL);
3144 }
3145 szc = highbit(szcvec) - 1;
3146 } else {
3147 return (error);
3148 }
3149 }
3150 }
3151
3152 /*
3153 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3154 * segments have a smaller szc than we want to set. For each such area,
3155 * it calls as_iset2_default_lpsize()
3156 */
3157 static int
3158 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3159 uint_t szcvec)
3160 {
3161 struct seg *seg;
3162 size_t ssize;
3163 caddr_t setaddr = raddr;
3164 size_t setsize = 0;
3165 int set;
3166 int error;
3167
3168 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3169
3170 seg = as_segat(as, raddr);
3171 if (seg == NULL) {
3172 panic("as_iset1_default_lpsize: no seg");
3173 }
3174 if (seg->s_szc < szc) {
3175 set = 1;
3176 } else {
3177 set = 0;
3178 }
3179
3180 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3181 if (raddr >= seg->s_base + seg->s_size) {
3182 seg = AS_SEGNEXT(as, seg);
3183 if (seg == NULL || raddr != seg->s_base) {
3184 panic("as_iset1_default_lpsize: as changed");
3185 }
3186 if (seg->s_szc >= szc && set) {
3187 ASSERT(setsize != 0);
3188 error = as_iset2_default_lpsize(as,
3216 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3217 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3218 * chunk to as_iset1_default_lpsize().
3219 */
3220 static int
3221 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3222 int type)
3223 {
3224 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3225 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3226 flags, rtype, 1);
3227 uint_t szc;
3228 uint_t nszc;
3229 int error;
3230 caddr_t a;
3231 caddr_t eaddr;
3232 size_t segsize;
3233 size_t pgsz;
3234 uint_t save_szcvec;
3235
3236 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3237 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3238 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3239
3240 szcvec &= ~1;
3241 if (szcvec <= 1) { /* skip if base page size */
3242 return (0);
3243 }
3244
3245 /* Get the pagesize of the first larger page size. */
3246 szc = lowbit(szcvec) - 1;
3247 pgsz = page_get_pagesize(szc);
3248 eaddr = addr + size;
3249 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3250 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3251
3252 save_szcvec = szcvec;
3253 szcvec >>= (szc + 1);
3254 nszc = szc;
3255 while (szcvec) {
3256 if ((szcvec & 0x1) == 0) {
 * chunks with the same type/flags, ignores non-segvn segments, and passes
3309 * each chunk to as_iset_default_lpsize().
3310 */
3311 int
3312 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3313 {
3314 struct seg *seg;
3315 caddr_t raddr;
3316 size_t rsize;
3317 size_t ssize;
3318 int rtype, rflags;
3319 int stype, sflags;
3320 int error;
3321 caddr_t setaddr;
3322 size_t setsize;
3323 int segvn;
3324
3325 if (size == 0)
3326 return (0);
3327
3328 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3329 again:
3330 error = 0;
3331
3332 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3333 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3334 (size_t)raddr;
3335
3336 if (raddr + rsize < raddr) { /* check for wraparound */
3337 AS_LOCK_EXIT(as, &as->a_lock);
3338 return (ENOMEM);
3339 }
3340 as_clearwatchprot(as, raddr, rsize);
3341 seg = as_segat(as, raddr);
3342 if (seg == NULL) {
3343 as_setwatch(as);
3344 AS_LOCK_EXIT(as, &as->a_lock);
3345 return (ENOMEM);
3346 }
3347 if (seg->s_ops == &segvn_ops) {
3348 rtype = SEGOP_GETTYPE(seg, addr);
3349 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3350 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3351 segvn = 1;
3352 } else {
3353 segvn = 0;
3354 }
3355 setaddr = raddr;
3356 setsize = 0;
3357
3358 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3359 if (raddr >= (seg->s_base + seg->s_size)) {
3360 seg = AS_SEGNEXT(as, seg);
3361 if (seg == NULL || raddr != seg->s_base) {
3362 error = ENOMEM;
3363 break;
3364 }
3409 if (error == 0 && segvn) {
3410 /* The last chunk when rsize == 0. */
3411 ASSERT(setsize != 0);
3412 error = as_iset_default_lpsize(as, setaddr, setsize,
3413 rflags, rtype);
3414 }
3415
3416 if (error == IE_RETRY) {
3417 goto again;
3418 } else if (error == IE_NOMEM) {
3419 error = EAGAIN;
3420 } else if (error == ENOTSUP) {
3421 error = EINVAL;
3422 } else if (error == EAGAIN) {
3423 mutex_enter(&as->a_contents);
3424 if (!AS_ISNOUNMAPWAIT(as)) {
3425 if (AS_ISUNMAPWAIT(as) == 0) {
3426 cv_broadcast(&as->a_cv);
3427 }
3428 AS_SETUNMAPWAIT(as);
3429 AS_LOCK_EXIT(as, &as->a_lock);
3430 while (AS_ISUNMAPWAIT(as)) {
3431 cv_wait(&as->a_cv, &as->a_contents);
3432 }
3433 mutex_exit(&as->a_contents);
3434 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3435 } else {
3436 /*
3437 * We may have raced with
3438 * segvn_reclaim()/segspt_reclaim(). In this case
3439 * clean nounmapwait flag and retry since softlockcnt
3440 * in this segment may be already 0. We don't drop as
3441 * writer lock so our number of retries without
3442 * sleeping should be very small. See segvn_reclaim()
3443 * for more comments.
3444 */
3445 AS_CLRNOUNMAPWAIT(as);
3446 mutex_exit(&as->a_contents);
3447 }
3448 goto again;
3449 }
3450
3451 as_setwatch(as);
3452 AS_LOCK_EXIT(as, &as->a_lock);
3453 return (error);
3454 }
3455
3456 /*
3457 * Setup all of the uninitialized watched pages that we can.
3458 */
3459 void
3460 as_setwatch(struct as *as)
3461 {
3462 struct watched_page *pwp;
3463 struct seg *seg;
3464 caddr_t vaddr;
3465 uint_t prot;
3466 int err, retrycnt;
3467
3468 if (avl_numnodes(&as->a_wpage) == 0)
3469 return;
3470
3471 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3472
3473 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3474 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3475 retrycnt = 0;
3476 retry:
3477 vaddr = pwp->wp_vaddr;
3478 if (pwp->wp_oprot != 0 || /* already set up */
3479 (seg = as_segat(as, vaddr)) == NULL ||
3480 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3481 continue;
3482
3483 pwp->wp_oprot = prot;
3484 if (pwp->wp_read)
3485 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3486 if (pwp->wp_write)
3487 prot &= ~PROT_WRITE;
3488 if (pwp->wp_exec)
3489 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3490 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3491 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3498 }
3499 pwp->wp_prot = prot;
3500 }
3501 }
3502
3503 /*
3504 * Clear all of the watched pages in the address space.
3505 */
3506 void
3507 as_clearwatch(struct as *as)
3508 {
3509 struct watched_page *pwp;
3510 struct seg *seg;
3511 caddr_t vaddr;
3512 uint_t prot;
3513 int err, retrycnt;
3514
3515 if (avl_numnodes(&as->a_wpage) == 0)
3516 return;
3517
3518 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3519
3520 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3521 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3522 retrycnt = 0;
3523 retry:
3524 vaddr = pwp->wp_vaddr;
3525 if (pwp->wp_oprot == 0 || /* not set up */
3526 (seg = as_segat(as, vaddr)) == NULL)
3527 continue;
3528
3529 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3530 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3531 if (err == IE_RETRY) {
3532 ASSERT(retrycnt == 0);
3533 retrycnt++;
3534 goto retry;
3535 }
3536 }
3537 pwp->wp_oprot = 0;
3538 pwp->wp_prot = 0;
3540 }
3541
3542 /*
3543 * Force a new setup for all the watched pages in the range.
3544 */
3545 static void
3546 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3547 {
3548 struct watched_page *pwp;
3549 struct watched_page tpw;
3550 caddr_t eaddr = addr + size;
3551 caddr_t vaddr;
3552 struct seg *seg;
3553 int err, retrycnt;
3554 uint_t wprot;
3555 avl_index_t where;
3556
3557 if (avl_numnodes(&as->a_wpage) == 0)
3558 return;
3559
3560 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3561
3562 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3563 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3564 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3565
3566 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3567 retrycnt = 0;
3568 vaddr = pwp->wp_vaddr;
3569
3570 wprot = prot;
3571 if (pwp->wp_read)
3572 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3573 if (pwp->wp_write)
3574 wprot &= ~PROT_WRITE;
3575 if (pwp->wp_exec)
3576 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3577 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3578 retry:
3579 seg = as_segat(as, vaddr);
3580 if (seg == NULL) {
3599 * Clear all of the watched pages in the range.
3600 */
3601 static void
3602 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3603 {
3604 caddr_t eaddr = addr + size;
3605 struct watched_page *pwp;
3606 struct watched_page tpw;
3607 uint_t prot;
3608 struct seg *seg;
3609 int err, retrycnt;
3610 avl_index_t where;
3611
3612 if (avl_numnodes(&as->a_wpage) == 0)
3613 return;
3614
3615 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3616 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3617 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3618
3619 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
3620
3621 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3622
3623 if ((prot = pwp->wp_oprot) != 0) {
3624 retrycnt = 0;
3625
3626 if (prot != pwp->wp_prot) {
3627 retry:
3628 seg = as_segat(as, pwp->wp_vaddr);
3629 if (seg == NULL)
3630 continue;
3631 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3632 PAGESIZE, prot);
3633 if (err == IE_RETRY) {
3634 ASSERT(retrycnt == 0);
3635 retrycnt++;
3636 goto retry;
3637
3638 }
3639 }
3654 for (p = practive; p; p = p->p_next) {
3655 if (p->p_as == as) {
3656 mutex_enter(&p->p_lock);
3657 if (p->p_as == as)
3658 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3659 mutex_exit(&p->p_lock);
3660 }
3661 }
3662 mutex_exit(&pidlock);
3663 }
3664
3665 /*
3666 * return memory object ID
3667 */
3668 int
3669 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3670 {
3671 struct seg *seg;
3672 int sts;
3673
3674 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
3675 seg = as_segat(as, addr);
3676 if (seg == NULL) {
3677 AS_LOCK_EXIT(as, &as->a_lock);
3678 return (EFAULT);
3679 }
3680 /*
3681 * catch old drivers which may not support getmemid
3682 */
3683 if (seg->s_ops->getmemid == NULL) {
3684 AS_LOCK_EXIT(as, &as->a_lock);
3685 return (ENODEV);
3686 }
3687
3688 sts = SEGOP_GETMEMID(seg, addr, memidp);
3689
3690 AS_LOCK_EXIT(as, &as->a_lock);
3691 return (sts);
3692 }
|
343 return (0);
344 }
345
346 /*
347 * Search for the segment containing addr. If a segment containing addr
348 * exists, that segment is returned. If no such segment exists, and
349 * the list spans addresses greater than addr, then the first segment
350 * whose base is greater than addr is returned; otherwise, NULL is
351 * returned unless tail is true, in which case the last element of the
352 * list is returned.
353 *
354 * a_seglast is used to cache the last found segment for repeated
355 * searches to the same addr (which happens frequently).
356 */
357 struct seg *
358 as_findseg(struct as *as, caddr_t addr, int tail)
359 {
360 struct seg *seg = as->a_seglast;
361 avl_index_t where;
362
363 ASSERT(AS_LOCK_HELD(as));
364
365 if (seg != NULL &&
366 seg->s_base <= addr &&
367 addr < seg->s_base + seg->s_size)
368 return (seg);
369
370 seg = avl_find(&as->a_segtree, &addr, &where);
371 if (seg != NULL)
372 return (as->a_seglast = seg);
373
374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 if (seg == NULL && tail)
376 seg = avl_last(&as->a_segtree);
377 return (as->a_seglast = seg);
378 }
379
380 #ifdef VERIFY_SEGLIST
381 /*
382 * verify that the linked list is coherent
383 */
405 nsegs++;
406 }
407 ASSERT(seglast == NULL);
408 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409 }
410 #endif /* VERIFY_SEGLIST */
411
412 /*
413 * Add a new segment to the address space. The avl_find()
414 * may be expensive so we attempt to use last segment accessed
415 * in as_gap() as an insertion point.
416 */
417 int
418 as_addseg(struct as *as, struct seg *newseg)
419 {
420 struct seg *seg;
421 caddr_t addr;
422 caddr_t eaddr;
423 avl_index_t where;
424
425 ASSERT(AS_WRITE_HELD(as));
426
427 as->a_updatedir = 1; /* inform /proc */
428 gethrestime(&as->a_updatetime);
429
430 if (as->a_lastgaphl != NULL) {
431 struct seg *hseg = NULL;
432 struct seg *lseg = NULL;
433
434 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 hseg = as->a_lastgaphl;
436 lseg = AVL_PREV(&as->a_segtree, hseg);
437 } else {
438 lseg = as->a_lastgaphl;
439 hseg = AVL_NEXT(&as->a_segtree, lseg);
440 }
441
442 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 hseg->s_base > newseg->s_base) {
444 avl_insert_here(&as->a_segtree, newseg, lseg,
445 AVL_AFTER);
487 }
488 #endif
489 return (-1); /* overlapping segment */
490 }
491 }
492 }
493 as->a_seglast = newseg;
494 avl_insert(&as->a_segtree, newseg, where);
495
496 #ifdef VERIFY_SEGLIST
497 as_verify(as);
498 #endif
499 return (0);
500 }
501
502 struct seg *
503 as_removeseg(struct as *as, struct seg *seg)
504 {
505 avl_tree_t *t;
506
507 ASSERT(AS_WRITE_HELD(as));
508
509 as->a_updatedir = 1; /* inform /proc */
510 gethrestime(&as->a_updatetime);
511
512 if (seg == NULL)
513 return (NULL);
514
515 t = &as->a_segtree;
516 if (as->a_seglast == seg)
517 as->a_seglast = NULL;
518 as->a_lastgaphl = NULL;
519
520 /*
521 * if this segment is at an address higher than
522 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523 */
524 if (as->a_lastgap &&
525 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 as->a_lastgap = AVL_NEXT(t, seg);
527
528 /*
529 * remove the segment from the seg tree
530 */
531 avl_remove(t, seg);
532
533 #ifdef VERIFY_SEGLIST
534 as_verify(as);
535 #endif
536 return (seg);
537 }
538
539 /*
540 * Find a segment containing addr.
541 */
542 struct seg *
543 as_segat(struct as *as, caddr_t addr)
544 {
545 struct seg *seg = as->a_seglast;
546
547 ASSERT(AS_LOCK_HELD(as));
548
549 if (seg != NULL && seg->s_base <= addr &&
550 addr < seg->s_base + seg->s_size)
551 return (seg);
552
553 seg = avl_find(&as->a_segtree, &addr, NULL);
554 return (seg);
555 }
556
557 /*
558 * Serialize all searches for holes in an address space to
559 * prevent two or more threads from allocating the same virtual
560 * address range. The address space must not be "read/write"
561 * locked by the caller since we may block.
562 */
563 void
564 as_rangelock(struct as *as)
565 {
566 mutex_enter(&as->a_contents);
567 while (AS_ISCLAIMGAP(as))
650 {
651 struct as *as;
652
653 as = kmem_cache_alloc(as_cache, KM_SLEEP);
654
655 as->a_flags = 0;
656 as->a_vbits = 0;
657 as->a_hrm = NULL;
658 as->a_seglast = NULL;
659 as->a_size = 0;
660 as->a_resvsize = 0;
661 as->a_updatedir = 0;
662 gethrestime(&as->a_updatetime);
663 as->a_objectdir = NULL;
664 as->a_sizedir = 0;
665 as->a_userlimit = (caddr_t)USERLIMIT;
666 as->a_lastgap = NULL;
667 as->a_lastgaphl = NULL;
668 as->a_callbacks = NULL;
669
670 AS_LOCK_ENTER(as, RW_WRITER);
671 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
672 AS_LOCK_EXIT(as);
673
674 as->a_xhat = NULL;
675
676 return (as);
677 }
678
679 /*
680 * Free an address space data structure.
681 * Need to free the hat first and then
682 * all the segments on this as and finally
683 * the space for the as struct itself.
684 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	int called = 0;		/* hat_free_start() done exactly once */

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	/* This will prevent new XHATs from attaching to as */
	if (!called)
		AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	/*
	 * Begin hat teardown only on the first pass; we may come back to
	 * "top" after an EAGAIN from SEGOP_UNMAP below.
	 */
	if (!called) {
		called = 1;
		hat_free_start(hat);
		if (as->a_xhat != NULL)
			xhat_free_start_all(as);
	}
	/* Unmap (and thereby free) every segment in the address space. */
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				/*
				 * Callbacks are registered: drop the as
				 * lock and loop back to "top" so
				 * as_do_callbacks() can process them.
				 */
				AS_LOCK_EXIT(as);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	if (as->a_xhat != NULL)
		xhat_free_end_all(as);
	AS_LOCK_EXIT(as);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem. Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}
780
/*
 * Duplicate address space "as" into a new one for forkedproc: clone the
 * hat, then seg_alloc()/SEGOP_DUP() every segment except those marked
 * S_PURGE, whose sizes are only subtracted from the reservation total.
 * Returns 0 on success, -1 when seg_alloc() fails, or the error from
 * SEGOP_DUP()/hat_dup().  On any failure the partially built address
 * space is torn down with as_free().
 */
int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t	purgesize = 0;	/* total size of skipped S_PURGE segments */
	int error;

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	/* This will prevent new XHATs from attaching */
	mutex_enter(&as->a_contents);
	AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	mutex_enter(&newas->a_contents);
	AS_SETBUSY(newas);
	mutex_exit(&newas->a_contents);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		/* S_PURGE segments are not copied into the child. */
		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			/*
			 * NOTE(review): unlike the SEGOP_DUP failure path
			 * below, AS_CLRBUSY(newas) is not done here before
			 * as_free(newas) — presumably harmless since newas
			 * is being destroyed, but worth confirming.
			 */
			AS_LOCK_EXIT(newas);
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as);
			seg_free(newseg);
			AS_LOCK_EXIT(newas);
			as_free(newas);
			return (error);
		}
		newas->a_size += seg->s_size;
	}
	/* Skipped S_PURGE segments reduce the child's reservation only. */
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
	if (as->a_xhat != NULL)
		error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);

	mutex_enter(&newas->a_contents);
	AS_CLRBUSY(newas);
	mutex_exit(&newas->a_contents);
	AS_LOCK_EXIT(newas);

	/* Re-arm watchpoints and release the parent. */
	as_setwatch(as);
	mutex_enter(&as->a_contents);
	AS_CLRBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_EXIT(as);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}
866
867 /*
868 * Handle a ``fault'' at addr for size bytes.
869 */
870 faultcode_t
871 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
872 enum fault_type type, enum seg_rw rw)
873 {
874 struct seg *seg;
875 caddr_t raddr; /* rounded down addr */
876 size_t rsize; /* rounded up size */
877 size_t ssize;
878 faultcode_t res = 0;
942 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
943 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
944 (size_t)raddr;
945
946 /*
947 * XXX -- Don't grab the as lock for segkmap. We should grab it for
948 * correctness, but then we could be stuck holding this lock for
949 * a LONG time if the fault needs to be resolved on a slow
950 * filesystem, and then no-one will be able to exec new commands,
951 * as exec'ing requires the write lock on the as.
952 */
953 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
954 raddr + size < segkmap->s_base + segkmap->s_size) {
955 /*
956 * if (as==&kas), this can't be XHAT: we've already returned
957 * FC_NOSUPPORT.
958 */
959 seg = segkmap;
960 as_lock_held = 0;
961 } else {
962 AS_LOCK_ENTER(as, RW_READER);
963 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
964 /*
965 * Grab and hold the writers' lock on the as
966 * if the fault is to a watched page.
967 * This will keep CPUs from "peeking" at the
968 * address range while we're temporarily boosting
969 * the permissions for the XHAT device to
970 * resolve the fault in the segment layer.
971 *
972 * We could check whether faulted address
973 * is within a watched page and only then grab
974 * the writer lock, but this is simpler.
975 */
976 AS_LOCK_EXIT(as);
977 AS_LOCK_ENTER(as, RW_WRITER);
978 }
979
980 seg = as_segat(as, raddr);
981 if (seg == NULL) {
982 AS_LOCK_EXIT(as);
983 if ((lwp != NULL) && (!is_xhat))
984 lwp->lwp_nostop--;
985 return (FC_NOMAP);
986 }
987
988 as_lock_held = 1;
989 }
990
991 addrsav = raddr;
992 segsav = seg;
993
994 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
995 if (raddr >= seg->s_base + seg->s_size) {
996 seg = AS_SEGNEXT(as, seg);
997 if (seg == NULL || raddr != seg->s_base) {
998 res = FC_NOMAP;
999 break;
1000 }
1001 }
1002 if (raddr + rsize > seg->s_base + seg->s_size)
1043 */
1044 if (res != 0 && type == F_SOFTLOCK) {
1045 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
1046 if (addrsav >= seg->s_base + seg->s_size)
1047 seg = AS_SEGNEXT(as, seg);
1048 ASSERT(seg != NULL);
1049 /*
1050 * Now call the fault routine again to perform the
1051 * unlock using S_OTHER instead of the rw variable
1052 * since we never got a chance to touch the pages.
1053 */
1054 if (raddr > seg->s_base + seg->s_size)
1055 ssize = seg->s_base + seg->s_size - addrsav;
1056 else
1057 ssize = raddr - addrsav;
1058 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
1059 F_SOFTUNLOCK, S_OTHER);
1060 }
1061 }
1062 if (as_lock_held)
1063 AS_LOCK_EXIT(as);
1064 if ((lwp != NULL) && (!is_xhat))
1065 lwp->lwp_nostop--;
1066
1067 /*
1068 * If the lower levels returned EDEADLK for a fault,
1069 * It means that we should retry the fault. Let's wait
1070 * a bit also to let the deadlock causing condition clear.
1071 * This is part of a gross hack to work around a design flaw
1072 * in the ufs/sds logging code and should go away when the
1073 * logging code is re-designed to fix the problem. See bug
1074 * 4125102 for details of the problem.
1075 */
1076 if (FC_ERRNO(res) == EDEADLK) {
1077 delay(deadlk_wait);
1078 res = 0;
1079 goto retry;
1080 }
1081 return (res);
1082 }
1083
1091 {
1092 struct seg *seg;
1093 caddr_t raddr; /* rounded down addr */
1094 size_t rsize; /* rounded up size */
1095 faultcode_t res = 0;
1096 klwp_t *lwp = ttolwp(curthread);
1097
1098 retry:
1099 /*
1100 * Indicate that the lwp is not to be stopped while waiting
1101 * for a pagefault. This is to avoid deadlock while debugging
1102 * a process via /proc over NFS (in particular).
1103 */
1104 if (lwp != NULL)
1105 lwp->lwp_nostop++;
1106
1107 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1108 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1109 (size_t)raddr;
1110
1111 AS_LOCK_ENTER(as, RW_READER);
1112 seg = as_segat(as, raddr);
1113 if (seg == NULL) {
1114 AS_LOCK_EXIT(as);
1115 if (lwp != NULL)
1116 lwp->lwp_nostop--;
1117 return (FC_NOMAP);
1118 }
1119
1120 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1121 if (raddr >= seg->s_base + seg->s_size) {
1122 seg = AS_SEGNEXT(as, seg);
1123 if (seg == NULL || raddr != seg->s_base) {
1124 res = FC_NOMAP;
1125 break;
1126 }
1127 }
1128 res = SEGOP_FAULTA(seg, raddr);
1129 if (res != 0)
1130 break;
1131 }
1132 AS_LOCK_EXIT(as);
1133 if (lwp != NULL)
1134 lwp->lwp_nostop--;
1135 /*
1136 * If the lower levels returned EDEADLK for a fault,
1137 * It means that we should retry the fault. Let's wait
1138 * a bit also to let the deadlock causing condition clear.
1139 * This is part of a gross hack to work around a design flaw
1140 * in the ufs/sds logging code and should go away when the
1141 * logging code is re-designed to fix the problem. See bug
1142 * 4125102 for details of the problem.
1143 */
1144 if (FC_ERRNO(res) == EDEADLK) {
1145 delay(deadlk_wait);
1146 res = 0;
1147 goto retry;
1148 }
1149 return (res);
1150 }
1151
1152 /*
1172 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1173 (size_t)raddr;
1174
1175 if (raddr + rsize < raddr) /* check for wraparound */
1176 return (ENOMEM);
1177
1178 saveraddr = raddr;
1179 saversize = rsize;
1180
1181 /*
1182 * Normally we only lock the as as a reader. But
1183 * if due to setprot the segment driver needs to split
1184 * a segment it will return IE_RETRY. Therefore we re-acquire
1185 * the as lock as a writer so the segment driver can change
1186 * the seg list. Also the segment driver will return IE_RETRY
1187 * after it has changed the segment list so we therefore keep
 * locking as a writer. Since these operations should be rare we
1189 * want to only lock as a writer when necessary.
1190 */
1191 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1192 AS_LOCK_ENTER(as, RW_WRITER);
1193 } else {
1194 AS_LOCK_ENTER(as, RW_READER);
1195 }
1196
1197 as_clearwatchprot(as, raddr, rsize);
1198 seg = as_segat(as, raddr);
1199 if (seg == NULL) {
1200 as_setwatch(as);
1201 AS_LOCK_EXIT(as);
1202 return (ENOMEM);
1203 }
1204
1205 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1206 if (raddr >= seg->s_base + seg->s_size) {
1207 seg = AS_SEGNEXT(as, seg);
1208 if (seg == NULL || raddr != seg->s_base) {
1209 error = ENOMEM;
1210 break;
1211 }
1212 }
1213 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1214 ssize = seg->s_base + seg->s_size - raddr;
1215 else
1216 ssize = rsize;
1217 retry:
1218 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1219
1220 if (error == IE_NOMEM) {
1221 error = EAGAIN;
1222 break;
1223 }
1224
1225 if (error == IE_RETRY) {
1226 AS_LOCK_EXIT(as);
1227 writer = 1;
1228 goto setprot_top;
1229 }
1230
1231 if (error == EAGAIN) {
1232 /*
1233 * Make sure we have a_lock as writer.
1234 */
1235 if (writer == 0) {
1236 AS_LOCK_EXIT(as);
1237 writer = 1;
1238 goto setprot_top;
1239 }
1240
1241 /*
1242 * Memory is currently locked. It must be unlocked
1243 * before this operation can succeed through a retry.
1244 * The possible reasons for locked memory and
1245 * corresponding strategies for unlocking are:
1246 * (1) Normal I/O
1247 * wait for a signal that the I/O operation
1248 * has completed and the memory is unlocked.
1249 * (2) Asynchronous I/O
1250 * The aio subsystem does not unlock pages when
1251 * the I/O is completed. Those pages are unlocked
1252 * when the application calls aiowait/aioerror.
1253 * So, to prevent blocking forever, cv_broadcast()
1254 * is done to wake up aio_cleanup_thread.
1255 * Subsequently, segvn_reclaim will be called, and
1256 * that will do AS_CLRUNMAPWAIT() and wake us up.
1257 * (3) Long term page locking:
1258 * Drivers intending to have pages locked for a
1259 * period considerably longer than for normal I/O
1260 * (essentially forever) may have registered for a
1261 * callback so they may unlock these pages on
1262 * request. This is needed to allow this operation
1263 * to succeed. Each entry on the callback list is
1264 * examined. If the event or address range pertains
1265 * the callback is invoked (unless it already is in
1266 * progress). The a_contents lock must be dropped
1267 * before the callback, so only one callback can
1268 * be done at a time. Go to the top and do more
1269 * until zero is returned. If zero is returned,
1270 * either there were no callbacks for this event
1271 * or they were already in progress.
1272 */
1273 mutex_enter(&as->a_contents);
1274 if (as->a_callbacks &&
1275 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1276 seg->s_base, seg->s_size))) {
1277 AS_LOCK_EXIT(as);
1278 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1279 } else if (!AS_ISNOUNMAPWAIT(as)) {
1280 if (AS_ISUNMAPWAIT(as) == 0)
1281 cv_broadcast(&as->a_cv);
1282 AS_SETUNMAPWAIT(as);
1283 AS_LOCK_EXIT(as);
1284 while (AS_ISUNMAPWAIT(as))
1285 cv_wait(&as->a_cv, &as->a_contents);
1286 } else {
1287 /*
1288 * We may have raced with
1289 * segvn_reclaim()/segspt_reclaim(). In this
1290 * case clean nounmapwait flag and retry since
1291 * softlockcnt in this segment may be already
1292 * 0. We don't drop as writer lock so our
1293 * number of retries without sleeping should
1294 * be very small. See segvn_reclaim() for
1295 * more comments.
1296 */
1297 AS_CLRNOUNMAPWAIT(as);
1298 mutex_exit(&as->a_contents);
1299 goto retry;
1300 }
1301 mutex_exit(&as->a_contents);
1302 goto setprot_top;
1303 } else if (error != 0)
1304 break;
1305 }
1306 if (error != 0) {
1307 as_setwatch(as);
1308 } else {
1309 as_setwatchprot(as, saveraddr, saversize, prot);
1310 }
1311 AS_LOCK_EXIT(as);
1312 return (error);
1313 }
1314
1315 /*
1316 * Check to make sure that the interval [addr, addr + size)
1317 * in address space `as' has at least the specified protection.
1318 * It is ok for the range to cross over several segments, as long
1319 * as they are contiguous.
1320 */
1321 int
1322 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1323 {
1324 struct seg *seg;
1325 size_t ssize;
1326 caddr_t raddr; /* rounded down addr */
1327 size_t rsize; /* rounded up size */
1328 int error = 0;
1329
1330 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1331 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1332 (size_t)raddr;
1333
1334 if (raddr + rsize < raddr) /* check for wraparound */
1335 return (ENOMEM);
1336
1337 /*
1338 * This is ugly as sin...
1339 * Normally, we only acquire the address space readers lock.
1340 * However, if the address space has watchpoints present,
1341 * we must acquire the writer lock on the address space for
1342 * the benefit of as_clearwatchprot() and as_setwatchprot().
1343 */
1344 if (avl_numnodes(&as->a_wpage) != 0)
1345 AS_LOCK_ENTER(as, RW_WRITER);
1346 else
1347 AS_LOCK_ENTER(as, RW_READER);
1348 as_clearwatchprot(as, raddr, rsize);
1349 seg = as_segat(as, raddr);
1350 if (seg == NULL) {
1351 as_setwatch(as);
1352 AS_LOCK_EXIT(as);
1353 return (ENOMEM);
1354 }
1355
1356 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1357 if (raddr >= seg->s_base + seg->s_size) {
1358 seg = AS_SEGNEXT(as, seg);
1359 if (seg == NULL || raddr != seg->s_base) {
1360 error = ENOMEM;
1361 break;
1362 }
1363 }
1364 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1365 ssize = seg->s_base + seg->s_size - raddr;
1366 else
1367 ssize = rsize;
1368
1369 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1370 if (error != 0)
1371 break;
1372 }
1373 as_setwatch(as);
1374 AS_LOCK_EXIT(as);
1375 return (error);
1376 }
1377
/*
 * Unmap the address range [addr, addr + size) from address space "as".
 * The range is rounded out to page boundaries and every overlapping
 * segment is unmapped (possibly partially).  Segment drivers may demand
 * that the operation wait for locked pages or be restarted from the
 * top, which is handled below.  Returns 0 on success and -1 on failure.
 */
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

top:
	/* Round the range out to page boundaries. */
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

			 * (3) Long term page locking:
			 * Drivers intending to have pages locked for a
			 * period considerably longer than for normal I/O
			 * (essentially forever) may have registered for a
			 * callback so they may unlock these pages on
			 * request. This is needed to allow this operation
			 * to succeed. Each entry on the callback list is
			 * examined. If the event or address range pertains
			 * the callback is invoked (unless it already is in
			 * progress). The a_contents lock must be dropped
			 * before the callback, so only one callback can
			 * be done at a time. Go to the top and do more
			 * until zero is returned. If zero is returned,
			 * either there were no callbacks for this event
			 * or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			/* Segment driver asked for a full restart. */
			AS_LOCK_EXIT(as);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			return (-1);
		}

		/* Account for the address (and any reserve) just unmapped. */
		as->a_size -= ssize;
		if (rsize)
			as->a_resvsize -= rsize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as);
	return (0);
}
1516
/*
 * Create segvn segment(s) covering [addr, addr + size).  szcvec is a
 * bit vector of candidate page size codes; when szcvec <= 1 only the
 * base page size is available and a single segment covers the whole
 * range.  Each segment is created through crfp with the create args
 * in vn_a.
 */
static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
	if (!do_off) {
		/* Neither vnode nor anon map: there is no backing offset. */
		vn_a->offset = 0;
	}

	if (szcvec <= 1) {
		/* Only the base page size: one segment covers everything. */
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
	ASSERT(addr == eaddr);

	return (0);
}
1630
1631 static int
1632 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1633 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1634 {
1635 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1636 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1637 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1638 type, 0);
1639 int error;
1640 struct seg *seg;
1641 struct vattr va;
1642 u_offset_t eoff;
1643 size_t save_size = 0;
1644 extern size_t textrepl_size_thresh;
1645
1646 ASSERT(AS_WRITE_HELD(as));
1647 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649 ASSERT(vn_a->vp != NULL);
1650 ASSERT(vn_a->amp == NULL);
1651
1652 again:
1653 if (szcvec <= 1) {
1654 seg = seg_alloc(as, addr, size);
1655 if (seg == NULL) {
1656 return (ENOMEM);
1657 }
1658 vn_a->szc = 0;
1659 error = (*crfp)(seg, vn_a);
1660 if (error != 0) {
1661 seg_free(seg);
1662 } else {
1663 as->a_size += size;
1664 as->a_resvsize += size;
1665 }
1666 return (error);
1715 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1716 {
1717 uint_t szcvec;
1718 uchar_t type;
1719
1720 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1721 if (vn_a->type == MAP_SHARED) {
1722 type = MAPPGSZC_SHM;
1723 } else if (vn_a->type == MAP_PRIVATE) {
1724 if (vn_a->szc == AS_MAP_HEAP) {
1725 type = MAPPGSZC_HEAP;
1726 } else if (vn_a->szc == AS_MAP_STACK) {
1727 type = MAPPGSZC_STACK;
1728 } else {
1729 type = MAPPGSZC_PRIVM;
1730 }
1731 }
1732 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1733 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1734 (vn_a->flags & MAP_TEXT), type, 0);
1735 ASSERT(AS_WRITE_HELD(as));
1736 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1737 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1738 ASSERT(vn_a->vp == NULL);
1739
1740 return (as_map_segvn_segs(as, addr, size, szcvec,
1741 crfp, vn_a, segcreated));
1742 }
1743
/*
 * Map [addr, addr + size) into address space "as".  Thin wrapper that
 * takes the address space lock as writer and hands off to
 * as_map_locked(), which drops the lock on all return paths.
 */
int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}
1750
/*
 * Map [addr, addr + size) into the address space, creating segment(s)
 * through the creation routine crfp.  Entered with the address space
 * lock held as writer (see as_map()); the lock is dropped on every
 * return path.  Returns 0 on success or an errno value.
 */
int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;		/* set if partial mappings need tearing down */
	struct proc *p = curproc;
	struct segvn_crargs crargs;	/* local copy of caller's create args */

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/* Over the process VM resource control: report the breach and fail. */
	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		/* vnode-backed mapping eligible for large pages */
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		/* anonymous mapping eligible for large pages */
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		/* Ordinary case: a single segment created directly via crfp. */
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
		as->a_resvsize += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
	}
	return (error);
}
1846
1847
1848 /*
1849 * Delete all segments in the address space marked with S_PURGE.
1850 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1851 * These segments are deleted as a first step before calls to as_gap(), so
1852 * that they don't affect mmap() or shmat().
1853 */
1854 void
1855 as_purge(struct as *as)
1856 {
1857 struct seg *seg;
1858 struct seg *next_seg;
1859
1860 /*
1861 * the setting of NEEDSPURGE is protect by as_rangelock(), so
1862 * no need to grab a_contents mutex for this check
1863 */
1864 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1865 return;
1866
1867 AS_LOCK_ENTER(as, RW_WRITER);
1868 next_seg = NULL;
1869 seg = AS_SEGFIRST(as);
1870 while (seg != NULL) {
1871 next_seg = AS_SEGNEXT(as, seg);
1872 if (seg->s_flags & S_PURGE)
1873 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1874 seg = next_seg;
1875 }
1876 AS_LOCK_EXIT(as);
1877
1878 mutex_enter(&as->a_contents);
1879 as->a_flags &= ~AS_NEEDSPURGE;
1880 mutex_exit(&as->a_contents);
1881 }
1882
1883 /*
1884 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1885 * range of addresses at least "minlen" long, where the base of the range is
1886 * at "off" phase from an "align" boundary and there is space for a
1887 * "redzone"-sized redzone on eithe rside of the range. Thus,
1888 * if align was 4M and off was 16k, the user wants a hole which will start
1889 * 16k into a 4M page.
1890 *
1891 * If flags specifies AH_HI, the hole will have the highest possible address
1892 * in the range. We use the as->a_lastgap field to figure out where to
1893 * start looking for a gap.
1894 *
1895 * Otherwise, the gap will have the lowest possible address.
1896 *
1919 save_base = *basep;
1920 save_len = *lenp;
1921 save_minlen = minlen;
1922 save_redzone = redzone;
1923
1924 /*
1925 * For the first pass/fast_path, just add align and redzone into
1926 * minlen since if we get an allocation, we can guarantee that it
1927 * will fit the alignment and redzone requested.
1928 * This increases the chance that hibound will be adjusted to
1929 * a_lastgap->s_base which will likely allow us to find an
1930 * acceptable hole in the address space quicker.
1931 * If we can't find a hole with this fast_path, then we look for
1932 * smaller holes in which the alignment and offset may allow
1933 * the allocation to fit.
1934 */
1935 minlen += align;
1936 minlen += 2 * redzone;
1937 redzone = 0;
1938
1939 AS_LOCK_ENTER(as, RW_READER);
1940 if (AS_SEGFIRST(as) == NULL) {
1941 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1942 align, redzone, off)) {
1943 AS_LOCK_EXIT(as);
1944 return (0);
1945 } else {
1946 AS_LOCK_EXIT(as);
1947 *basep = save_base;
1948 *lenp = save_len;
1949 return (-1);
1950 }
1951 }
1952
1953 retry:
1954 /*
1955 * Set up to iterate over all the inter-segment holes in the given
1956 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1957 * NULL for the highest-addressed hole. If moving backwards, we reset
1958 * sseg to denote the highest-addressed segment.
1959 */
1960 forward = (flags & AH_DIR) == AH_LO;
1961 if (forward) {
1962 hseg = as_findseg(as, lobound, 1);
1963 lseg = AS_SEGPREV(as, hseg);
1964 } else {
1965
1966 /*
2007 lo = lobound;
2008 if (hi > hibound)
2009 hi = hibound;
2010 /*
2011 * Verify that the candidate hole is big enough and meets
2012 * hardware constraints. If the hole is too small, no need
2013 * to do the further checks since they will fail.
2014 */
2015 *basep = lo;
2016 *lenp = hi - lo;
2017 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
2018 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
2019 ((flags & AH_CONTAIN) == 0 ||
2020 (*basep <= addr && *basep + *lenp > addr))) {
2021 if (!forward)
2022 as->a_lastgap = hseg;
2023 if (hseg != NULL)
2024 as->a_lastgaphl = hseg;
2025 else
2026 as->a_lastgaphl = lseg;
2027 AS_LOCK_EXIT(as);
2028 return (0);
2029 }
2030 cont:
2031 /*
2032 * Move to the next hole.
2033 */
2034 if (forward) {
2035 lseg = hseg;
2036 if (lseg == NULL)
2037 break;
2038 hseg = AS_SEGNEXT(as, hseg);
2039 } else {
2040 hseg = lseg;
2041 if (hseg == NULL)
2042 break;
2043 lseg = AS_SEGPREV(as, lseg);
2044 }
2045 }
2046 if (fast_path && (align != 0 || save_redzone != 0)) {
2047 fast_path = 0;
2048 minlen = save_minlen;
2049 redzone = save_redzone;
2050 goto retry;
2051 }
2052 *basep = save_base;
2053 *lenp = save_len;
2054 AS_LOCK_EXIT(as);
2055 return (-1);
2056 }
2057
2058 /*
2059 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2060 *
2061 * If flags specifies AH_HI, the hole will have the highest possible address
2062 * in the range. We use the as->a_lastgap field to figure out where to
2063 * start looking for a gap.
2064 *
2065 * Otherwise, the gap will have the lowest possible address.
2066 *
2067 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2068 *
2069 * If an adequate hole is found, base and len are set to reflect the part of
2070 * the hole that is within range, and 0 is returned, otherwise,
2071 * -1 is returned.
2072 *
2073 * NOTE: This routine is not correct when base+len overflows caddr_t.
2074 */
2076 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2077 caddr_t addr)
2078 {
2079
2080 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2081 }
2082
2083 /*
2084 * Return the next range within [base, base + len) that is backed
2085 * with "real memory". Skip holes and non-seg_vn segments.
2086 * We're lazy and only return one segment at a time.
2087 */
2088 int
2089 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2090 {
2091 extern struct seg_ops segspt_shmops; /* needs a header file */
2092 struct seg *seg;
2093 caddr_t addr, eaddr;
2094 caddr_t segend;
2095
2096 AS_LOCK_ENTER(as, RW_READER);
2097
2098 addr = *basep;
2099 eaddr = addr + *lenp;
2100
2101 seg = as_findseg(as, addr, 0);
2102 if (seg != NULL)
2103 addr = MAX(seg->s_base, addr);
2104
2105 for (;;) {
2106 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2107 AS_LOCK_EXIT(as);
2108 return (EINVAL);
2109 }
2110
2111 if (seg->s_ops == &segvn_ops) {
2112 segend = seg->s_base + seg->s_size;
2113 break;
2114 }
2115
2116 /*
2117 * We do ISM by looking into the private data
2118 * to determine the real size of the segment.
2119 */
2120 if (seg->s_ops == &segspt_shmops) {
2121 segend = seg->s_base + spt_realsize(seg);
2122 if (addr < segend)
2123 break;
2124 }
2125
2126 seg = AS_SEGNEXT(as, seg);
2127
2128 if (seg != NULL)
2129 addr = seg->s_base;
2130 }
2131
2132 *basep = addr;
2133
2134 if (segend > eaddr)
2135 *lenp = eaddr - addr;
2136 else
2137 *lenp = segend - addr;
2138
2139 AS_LOCK_EXIT(as);
2140 return (0);
2141 }
2142
2143 /*
2144 * Swap the pages associated with the address space as out to
2145 * secondary storage, returning the number of bytes actually
2146 * swapped.
2147 *
2148 * The value returned is intended to correlate well with the process's
2149 * memory requirements. Its usefulness for this purpose depends on
2150 * how well the segment-level routines do at returning accurate
2151 * information.
2152 */
2153 size_t
2154 as_swapout(struct as *as)
2155 {
2156 struct seg *seg;
2157 size_t swpcnt = 0;
2158
2159 /*
2160 * Kernel-only processes have given up their address
2161 * spaces. Of course, we shouldn't be attempting to
2162 * swap out such processes in the first place...
2163 */
2164 if (as == NULL)
2165 return (0);
2166
2167 AS_LOCK_ENTER(as, RW_READER);
2168
2169 /* Prevent XHATs from attaching */
2170 mutex_enter(&as->a_contents);
2171 AS_SETBUSY(as);
2172 mutex_exit(&as->a_contents);
2173
2174
2175 /*
2176 * Free all mapping resources associated with the address
2177 * space. The segment-level swapout routines capitalize
2178 * on this unmapping by scavanging pages that have become
2179 * unmapped here.
2180 */
2181 hat_swapout(as->a_hat);
2182 if (as->a_xhat != NULL)
2183 xhat_swapout_all(as);
2184
2185 mutex_enter(&as->a_contents);
2186 AS_CLRBUSY(as);
2187 mutex_exit(&as->a_contents);
2188
2189 /*
2190 * Call the swapout routines of all segments in the address
2191 * space to do the actual work, accumulating the amount of
2192 * space reclaimed.
2193 */
2194 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2195 struct seg_ops *ov = seg->s_ops;
2196
2197 /*
2198 * We have to check to see if the seg has
2199 * an ops vector because the seg may have
2200 * been in the middle of being set up when
2201 * the process was picked for swapout.
2202 */
2203 if ((ov != NULL) && (ov->swapout != NULL))
2204 swpcnt += SEGOP_SWAPOUT(seg);
2205 }
2206 AS_LOCK_EXIT(as);
2207 return (swpcnt);
2208 }
2209
2210 /*
2211 * Determine whether data from the mappings in interval [addr, addr + size)
2212 * are in the primary memory (core) cache.
2213 */
2214 int
2215 as_incore(struct as *as, caddr_t addr,
2216 size_t size, char *vec, size_t *sizep)
2217 {
2218 struct seg *seg;
2219 size_t ssize;
2220 caddr_t raddr; /* rounded down addr */
2221 size_t rsize; /* rounded up size */
2222 size_t isize; /* iteration size */
2223 int error = 0; /* result, assume success */
2224
2225 *sizep = 0;
2226 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2227 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2228 (size_t)raddr;
2229
2230 if (raddr + rsize < raddr) /* check for wraparound */
2231 return (ENOMEM);
2232
2233 AS_LOCK_ENTER(as, RW_READER);
2234 seg = as_segat(as, raddr);
2235 if (seg == NULL) {
2236 AS_LOCK_EXIT(as);
2237 return (-1);
2238 }
2239
2240 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2241 if (raddr >= seg->s_base + seg->s_size) {
2242 seg = AS_SEGNEXT(as, seg);
2243 if (seg == NULL || raddr != seg->s_base) {
2244 error = -1;
2245 break;
2246 }
2247 }
2248 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2249 ssize = seg->s_base + seg->s_size - raddr;
2250 else
2251 ssize = rsize;
2252 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2253 if (isize != ssize) {
2254 error = -1;
2255 break;
2256 }
2257 vec += btopr(ssize);
2258 }
2259 AS_LOCK_EXIT(as);
2260 return (error);
2261 }
2262
2263 static void
2264 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2265 ulong_t *bitmap, size_t position, size_t npages)
2266 {
2267 caddr_t range_start;
2268 size_t pos1 = position;
2269 size_t pos2;
2270 size_t size;
2271 size_t end_pos = npages + position;
2272
2273 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2274 size = ptob((pos2 - pos1));
2275 range_start = (caddr_t)((uintptr_t)addr +
2276 ptob(pos1 - position));
2277
2278 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2279 (ulong_t *)NULL, (size_t)NULL);
2309 * address space "as".
2310 */
2311 /*ARGSUSED*/
/*
 * Perform the memory control operation func (MC_SYNC, MC_LOCK,
 * MC_UNLOCK, MC_LOCKAS, MC_UNLOCKAS, MC_ADVISE, MC_INHERIT_ZERO) over
 * the range [addr, addr + size), or over the whole address space for
 * the *AS variants.  Returns 0 on success or an errno value.
 */
int
as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
    uintptr_t arg, ulong_t *lock_map, size_t pos)
{
	struct seg *seg;	/* working segment */
	caddr_t raddr;		/* rounded down addr */
	caddr_t initraddr;	/* saved initial rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t initrsize;	/* saved initial rounded up size */
	size_t ssize;		/* size of seg */
	int error = 0;			/* result */
	size_t mlock_size;	/* size of bitmap */
	ulong_t *mlock_map;	/* pointer to bitmap used */
				/* to represent the locked */
				/* pages. */
retry:
	/*
	 * On IE_RETRY from a segment driver we come back here needing
	 * the address space lock as writer; the first pass (and the
	 * EDEADLK retry below) only needs it as reader.
	 */
	if (error == IE_RETRY)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);

	/*
	 * If these are address space lock/unlock operations, loop over
	 * all segments in the address space, as appropriate.
	 */
	if (func == MC_LOCKAS) {
		size_t npages, idx;
		size_t rlen = 0;	/* rounded as length */

		idx = pos;

		if (arg & MCL_FUTURE) {
			mutex_enter(&as->a_contents);
			AS_SETPGLCK(as);
			mutex_exit(&as->a_contents);
		}
		if ((arg & MCL_CURRENT) == 0) {
			AS_LOCK_EXIT(as);
			return (0);
		}

		seg = AS_SEGFIRST(as);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (0);
		}

		/* Total the page-rounded length of all segments. */
		do {
			raddr = (caddr_t)((uintptr_t)seg->s_base &
			    (uintptr_t)PAGEMASK);
			rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
			    PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
		} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

		mlock_size = BT_BITOUL(btopr(rlen));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
			AS_LOCK_EXIT(as);
			return (EAGAIN);
		}

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = SEGOP_LOCKOP(seg, seg->s_base,
			    seg->s_size, attr, MC_LOCK, mlock_map, pos);
			if (error != 0)
				break;
			pos += seg_pages(seg);
		}

		/*
		 * On failure, undo any pages the lock loop managed to
		 * lock, as recorded in the mlock_map bitmap.
		 */
		if (error) {
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg)) {

				raddr = (caddr_t)((uintptr_t)seg->s_base &
				    (uintptr_t)PAGEMASK);
				npages = seg_pages(seg);
				as_segunlock(seg, raddr, attr, mlock_map,
				    idx, npages);
				idx += npages;
			}
		}

		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
		AS_LOCK_EXIT(as);
		goto lockerr;
	} else if (func == MC_UNLOCKAS) {
		mutex_enter(&as->a_contents);
		AS_CLRPGLCK(as);
		mutex_exit(&as->a_contents);

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = SEGOP_LOCKOP(seg, seg->s_base,
			    seg->s_size, attr, MC_UNLOCK, NULL, 0);
			if (error != 0)
				break;
		}

		AS_LOCK_EXIT(as);
		goto lockerr;
	}

	/*
	 * Normalize addresses and sizes.
	 */
	initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	/*
	 * Get initial segment.
	 */
	if ((seg = as_segat(as, raddr)) == NULL) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	if (func == MC_LOCK) {
		mlock_size = BT_BITOUL(btopr(rsize));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
			AS_LOCK_EXIT(as);
			return (EAGAIN);
		}
	}

	/*
	 * Loop over all segments. If a hole in the address range is
	 * discovered, then fail. For each segment, perform the appropriate
	 * control operation.
	 */
	while (rsize != 0) {

		/*
		 * Make sure there's no hole, calculate the portion
		 * of the next segment to be operated over.
		 */
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				if (func == MC_LOCK) {
					as_unlockerr(as, attr, mlock_map,
					    initraddr, initrsize - rsize);
					kmem_free(mlock_map,
					    mlock_size * sizeof (ulong_t));
				}
				AS_LOCK_EXIT(as);
				return (ENOMEM);
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		/*
		 * Dispatch on specific function.
		 */
		switch (func) {

		/*
		 * Synchronize cached data from mappings with backing
		 * objects.
		 */
		case MC_SYNC:
			if (error = SEGOP_SYNC(seg, raddr, ssize,
			    attr, (uint_t)arg)) {
				AS_LOCK_EXIT(as);
				return (error);
			}
			break;

		/*
		 * Lock pages in memory.
		 */
		case MC_LOCK:
			if (error = SEGOP_LOCKOP(seg, raddr, ssize,
			    attr, func, mlock_map, pos)) {
				as_unlockerr(as, attr, mlock_map, initraddr,
				    initrsize - rsize + ssize);
				kmem_free(mlock_map, mlock_size *
				    sizeof (ulong_t));
				AS_LOCK_EXIT(as);
				goto lockerr;
			}
			break;

		/*
		 * Unlock mapped pages.
		 */
		case MC_UNLOCK:
			(void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
			    (ulong_t *)NULL, (size_t)NULL);
			break;

		/*
		 * Store VM advise for mapped pages in segment layer.
		 */
		case MC_ADVISE:
			error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);

			/*
			 * Check for regular errors and special retry error
			 */
			if (error) {
				if (error == IE_RETRY) {
					/*
					 * Need to acquire writers lock, so
					 * have to drop readers lock and start
					 * all over again
					 */
					AS_LOCK_EXIT(as);
					goto retry;
				} else if (error == IE_REATTACH) {
					/*
					 * Find segment for current address
					 * because current segment just got
					 * split or concatenated
					 */
					seg = as_segat(as, raddr);
					if (seg == NULL) {
						AS_LOCK_EXIT(as);
						return (ENOMEM);
					}
				} else {
					/*
					 * Regular error
					 */
					AS_LOCK_EXIT(as);
					return (error);
				}
			}
			break;

		/*
		 * Set inheritance disposition (SEGP_INH_ZERO) for the
		 * range; segments without an inherit op can't support it.
		 */
		case MC_INHERIT_ZERO:
			if (seg->s_ops->inherit == NULL) {
				error = ENOTSUP;
			} else {
				error = SEGOP_INHERIT(seg, raddr, ssize,
				    SEGP_INH_ZERO);
			}
			if (error != 0) {
				AS_LOCK_EXIT(as);
				return (error);
			}
			break;

		/*
		 * Can't happen.
		 */
		default:
			panic("as_ctl: bad operation %d", func);
			/*NOTREACHED*/
		}

		rsize -= ssize;
		raddr += ssize;
	}

	if (func == MC_LOCK)
		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
	AS_LOCK_EXIT(as);
	return (0);
lockerr:

	/*
	 * If the lower levels returned EDEADLK for a segment lockop,
	 * it means that we should retry the operation. Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (error == EDEADLK) {
		delay(deadlk_wait);
		error = 0;
		goto retry;
	}
	return (error);
}
2597
2622 */
2623 static int
2624 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2625 caddr_t addr, size_t size, enum seg_rw rw)
2626 {
2627 caddr_t sv_addr = addr;
2628 size_t sv_size = size;
2629 struct seg *sv_seg = seg;
2630 ulong_t segcnt = 1;
2631 ulong_t cnt;
2632 size_t ssize;
2633 pgcnt_t npages = btop(size);
2634 page_t **plist;
2635 page_t **pl;
2636 int error;
2637 caddr_t eaddr;
2638 faultcode_t fault_err = 0;
2639 pgcnt_t pl_off;
2640 extern struct seg_ops segspt_shmops;
2641
2642 ASSERT(AS_LOCK_HELD(as));
2643 ASSERT(seg != NULL);
2644 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2645 ASSERT(addr + size > seg->s_base + seg->s_size);
2646 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2647 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2648
2649 /*
2650 * Count the number of segments covered by the range we are about to
2651 * lock. The segment count is used to size the shadow list we return
2652 * back to the caller.
2653 */
2654 for (; size != 0; size -= ssize, addr += ssize) {
2655 if (addr >= seg->s_base + seg->s_size) {
2656
2657 seg = AS_SEGNEXT(as, seg);
2658 if (seg == NULL || addr != seg->s_base) {
2659 AS_LOCK_EXIT(as);
2660 return (EFAULT);
2661 }
2662 /*
2663 * Do a quick check if subsequent segments
2664 * will most likely support pagelock.
2665 */
2666 if (seg->s_ops == &segvn_ops) {
2667 vnode_t *vp;
2668
2669 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2670 vp != NULL) {
2671 AS_LOCK_EXIT(as);
2672 goto slow;
2673 }
2674 } else if (seg->s_ops != &segspt_shmops) {
2675 AS_LOCK_EXIT(as);
2676 goto slow;
2677 }
2678 segcnt++;
2679 }
2680 if (addr + size > seg->s_base + seg->s_size) {
2681 ssize = seg->s_base + seg->s_size - addr;
2682 } else {
2683 ssize = size;
2684 }
2685 }
2686 ASSERT(segcnt > 1);
2687
2688 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2689
2690 addr = sv_addr;
2691 size = sv_size;
2692 seg = sv_seg;
2693
2694 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2695 if (addr >= seg->s_base + seg->s_size) {
2700 }
2701 if (addr + size > seg->s_base + seg->s_size) {
2702 ssize = seg->s_base + seg->s_size - addr;
2703 } else {
2704 ssize = size;
2705 }
2706 pl = &plist[npages + cnt];
2707 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2708 L_PAGELOCK, rw);
2709 if (error) {
2710 break;
2711 }
2712 ASSERT(plist[npages + cnt] != NULL);
2713 ASSERT(pl_off + btop(ssize) <= npages);
2714 bcopy(plist[npages + cnt], &plist[pl_off],
2715 btop(ssize) * sizeof (page_t *));
2716 pl_off += btop(ssize);
2717 }
2718
2719 if (size == 0) {
2720 AS_LOCK_EXIT(as);
2721 ASSERT(cnt == segcnt - 1);
2722 *ppp = plist;
2723 return (0);
2724 }
2725
2726 /*
2727 * one of pagelock calls failed. The error type is in error variable.
2728 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
2729 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2730 * back to the caller.
2731 */
2732
2733 eaddr = addr;
2734 seg = sv_seg;
2735
2736 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2737 if (addr >= seg->s_base + seg->s_size) {
2738 seg = AS_SEGNEXT(as, seg);
2739 ASSERT(seg != NULL && addr == seg->s_base);
2740 cnt++;
2741 ASSERT(cnt < segcnt);
2742 }
2743 if (eaddr > seg->s_base + seg->s_size) {
2744 ssize = seg->s_base + seg->s_size - addr;
2745 } else {
2746 ssize = eaddr - addr;
2747 }
2748 pl = &plist[npages + cnt];
2749 ASSERT(*pl != NULL);
2750 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2751 L_PAGEUNLOCK, rw);
2752 }
2753
2754 AS_LOCK_EXIT(as);
2755
2756 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2757
2758 if (error != ENOTSUP && error != EFAULT) {
2759 return (error);
2760 }
2761
2762 slow:
2763 /*
2764 * If we are here because pagelock failed due to the need to cow fault
2765 * in the pages we want to lock F_SOFTLOCK will do this job and in
2766 * next as_pagelock() call for this address range pagelock will
2767 * hopefully succeed.
2768 */
2769 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2770 if (fault_err != 0) {
2771 return (fc_decode(fault_err));
2772 }
2773 *ppp = NULL;
2774
as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
    size_t size, enum seg_rw rw)
{
	size_t rsize;
	caddr_t raddr;
	faultcode_t fault_err;
	struct seg *seg;
	int err;

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
	    "as_pagelock_start: addr %p size %ld", addr, size);

	/* Round the request down/up to whole-page boundaries. */
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * if the request crosses two segments let
	 * as_fault handle it.
	 */
	AS_LOCK_ENTER(as, RW_READER);

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}
	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize > seg->s_base + seg->s_size) {
		/*
		 * Range spans multiple segments; as_pagelock_segs()
		 * drops the as lock on our behalf.
		 */
		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
	}
	if (raddr + rsize <= raddr) {
		/* The rounded range wrapped around the address space. */
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
	    "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);

	/*
	 * try to lock pages and pass back shadow list
	 */
	err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);

	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");

	AS_LOCK_EXIT(as);

	/* Any error other than ENOTSUP/EFAULT is final; pass it back. */
	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
		return (err);
	}

	/*
	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
	 * to no pagelock support for this segment or pages need to be cow
	 * faulted in. If fault is needed F_SOFTLOCK will do this job for
	 * this as_pagelock() call and in the next as_pagelock() call for the
	 * same address range pagelock call will hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	/* NULL shadow list tells as_pageunlock() to use F_SOFTUNLOCK. */
	*ppp = NULL;

	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
	return (0);
}
2851
2852 /*
2853 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2854 * lists from the end of plist and call pageunlock interface for each segment.
2855 * Drop as lock and free plist.
2856 */
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
    struct page **plist, enum seg_rw rw)
{
	ulong_t cnt;
	caddr_t eaddr = addr + size;
	pgcnt_t npages = btop(size);
	size_t ssize;
	page_t **pl;

	/* Caller passes the as lock in held; we drop it before returning. */
	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(plist != NULL);

	/*
	 * plist[0 .. npages - 1] is the flat page array; the per-segment
	 * shadow-list pointers follow it, starting at plist[npages].
	 * cnt indexes the per-segment entries (0 for the first segment).
	 */
	for (cnt = 0; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			/* Advance to the next (adjacent) segment. */
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
		}
		/* Clamp this chunk to the end of the current segment. */
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}
	/* The range spanned at least two segments, so cnt advanced. */
	ASSERT(cnt > 0);
	AS_LOCK_EXIT(as);

	/* Free npages page slots plus cnt + 1 per-segment shadow slots. */
	cnt++;
	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}
2897
2898 /*
2899 * unlock pages in a given address range
2900 */
2901 void
2902 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2903 enum seg_rw rw)
2904 {
2905 struct seg *seg;
2906 size_t rsize;
2907 caddr_t raddr;
2908
2909 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2910 "as_pageunlock_start: addr %p size %ld", addr, size);
2911
2912 /*
2913 * if the shadow list is NULL, as_pagelock was
2914 * falling back to as_fault
2915 */
2916 if (pp == NULL) {
2917 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2918 return;
2919 }
2920
2921 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2922 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2923 (size_t)raddr;
2924
2925 AS_LOCK_ENTER(as, RW_READER);
2926 seg = as_segat(as, raddr);
2927 ASSERT(seg != NULL);
2928
2929 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2930 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2931
2932 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2933 if (raddr + rsize <= seg->s_base + seg->s_size) {
2934 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2935 } else {
2936 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2937 return;
2938 }
2939 AS_LOCK_EXIT(as);
2940 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2941 }
2942
/*
 * Set the page size code of the range [addr, addr + size) to szc.  Both
 * addr and size must be aligned to the new page size.  If wait is B_TRUE,
 * sleep and retry when a segment reports EAGAIN because its pages are
 * currently locked; otherwise EAGAIN is returned to the caller.
 */
int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;
	size_t pgsz = page_get_pagesize(szc);

setpgsz_top:
	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
		return (EINVAL);
	}

	raddr = addr;
	rsize = size;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	/* Walk the range segment by segment, clamping to segment bounds. */
	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				/* Hole in the range: fail. */
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

retry:
		error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			/* Segment asked us to restart from scratch. */
			AS_LOCK_EXIT(as);
			goto setpgsz_top;
		}

		if (error == ENOTSUP) {
			error = EINVAL;
			break;
		}

		if (wait && (error == EAGAIN)) {
			/*
			 * Memory is currently locked. It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	This is not relevant for as_setpagesize()
			 *	because we cannot change the page size for
			 *	driver memory. The attempt to do so will
			 *	fail with a different error than EAGAIN so
			 *	there's no need to trigger as callbacks like
			 *	as_unmap, as_setprot or as_free would do.
			 */
			mutex_enter(&as->a_contents);
			if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0) {
					cv_broadcast(&as->a_cv);
				}
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as)) {
					cv_wait(&as->a_cv, &as->a_contents);
				}
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setpgsz_top;
		} else if (error != 0) {
			break;
		}
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
3066
3067 /*
3068 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3069 * in its chunk where s_szc is less than the szc we want to set.
3070 */
3071 static int
3072 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3073 int *retry)
3074 {
3075 struct seg *seg;
3076 size_t ssize;
3077 int error;
3078
3079 ASSERT(AS_WRITE_HELD(as));
3080
3081 seg = as_segat(as, raddr);
3082 if (seg == NULL) {
3083 panic("as_iset3_default_lpsize: no seg");
3084 }
3085
3086 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3087 if (raddr >= seg->s_base + seg->s_size) {
3088 seg = AS_SEGNEXT(as, seg);
3089 if (seg == NULL || raddr != seg->s_base) {
3090 panic("as_iset3_default_lpsize: as changed");
3091 }
3092 }
3093 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3094 ssize = seg->s_base + seg->s_size - raddr;
3095 } else {
3096 ssize = rsize;
3097 }
3098
3099 if (szc > seg->s_szc) {
3116 }
3117 return (0);
3118 }
3119
3120 /*
3121 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3122 * pagesize on each segment in its range, but if any fails with EINVAL,
3123 * then it reduces the pagesizes to the next size in the bitmap and
3124 * retries as_iset3_default_lpsize(). The reason why the code retries
3125 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3126 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3127 * with) to pass to map_pgszcvec().
3128 */
3129 static int
3130 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3131 uint_t szcvec)
3132 {
3133 int error;
3134 int retry;
3135
3136 ASSERT(AS_WRITE_HELD(as));
3137
3138 for (;;) {
3139 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3140 if (error == EINVAL && retry) {
3141 szcvec &= ~(1 << szc);
3142 if (szcvec <= 1) {
3143 return (EINVAL);
3144 }
3145 szc = highbit(szcvec) - 1;
3146 } else {
3147 return (error);
3148 }
3149 }
3150 }
3151
3152 /*
3153 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3154 * segments have a smaller szc than we want to set. For each such area,
 * it calls as_iset2_default_lpsize().
3156 */
3157 static int
3158 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3159 uint_t szcvec)
3160 {
3161 struct seg *seg;
3162 size_t ssize;
3163 caddr_t setaddr = raddr;
3164 size_t setsize = 0;
3165 int set;
3166 int error;
3167
3168 ASSERT(AS_WRITE_HELD(as));
3169
3170 seg = as_segat(as, raddr);
3171 if (seg == NULL) {
3172 panic("as_iset1_default_lpsize: no seg");
3173 }
3174 if (seg->s_szc < szc) {
3175 set = 1;
3176 } else {
3177 set = 0;
3178 }
3179
3180 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3181 if (raddr >= seg->s_base + seg->s_size) {
3182 seg = AS_SEGNEXT(as, seg);
3183 if (seg == NULL || raddr != seg->s_base) {
3184 panic("as_iset1_default_lpsize: as changed");
3185 }
3186 if (seg->s_szc >= szc && set) {
3187 ASSERT(setsize != 0);
3188 error = as_iset2_default_lpsize(as,
3216 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3217 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3218 * chunk to as_iset1_default_lpsize().
3219 */
3220 static int
3221 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3222 int type)
3223 {
3224 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3225 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3226 flags, rtype, 1);
3227 uint_t szc;
3228 uint_t nszc;
3229 int error;
3230 caddr_t a;
3231 caddr_t eaddr;
3232 size_t segsize;
3233 size_t pgsz;
3234 uint_t save_szcvec;
3235
3236 ASSERT(AS_WRITE_HELD(as));
3237 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3238 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3239
3240 szcvec &= ~1;
3241 if (szcvec <= 1) { /* skip if base page size */
3242 return (0);
3243 }
3244
3245 /* Get the pagesize of the first larger page size. */
3246 szc = lowbit(szcvec) - 1;
3247 pgsz = page_get_pagesize(szc);
3248 eaddr = addr + size;
3249 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3250 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3251
3252 save_szcvec = szcvec;
3253 szcvec >>= (szc + 1);
3254 nszc = szc;
3255 while (szcvec) {
3256 if ((szcvec & 0x1) == 0) {
 * chunks with the same type/flags, ignores non-segvn segments, and passes
3309 * each chunk to as_iset_default_lpsize().
3310 */
3311 int
3312 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3313 {
3314 struct seg *seg;
3315 caddr_t raddr;
3316 size_t rsize;
3317 size_t ssize;
3318 int rtype, rflags;
3319 int stype, sflags;
3320 int error;
3321 caddr_t setaddr;
3322 size_t setsize;
3323 int segvn;
3324
3325 if (size == 0)
3326 return (0);
3327
3328 AS_LOCK_ENTER(as, RW_WRITER);
3329 again:
3330 error = 0;
3331
3332 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3333 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3334 (size_t)raddr;
3335
3336 if (raddr + rsize < raddr) { /* check for wraparound */
3337 AS_LOCK_EXIT(as);
3338 return (ENOMEM);
3339 }
3340 as_clearwatchprot(as, raddr, rsize);
3341 seg = as_segat(as, raddr);
3342 if (seg == NULL) {
3343 as_setwatch(as);
3344 AS_LOCK_EXIT(as);
3345 return (ENOMEM);
3346 }
3347 if (seg->s_ops == &segvn_ops) {
3348 rtype = SEGOP_GETTYPE(seg, addr);
3349 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3350 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3351 segvn = 1;
3352 } else {
3353 segvn = 0;
3354 }
3355 setaddr = raddr;
3356 setsize = 0;
3357
3358 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3359 if (raddr >= (seg->s_base + seg->s_size)) {
3360 seg = AS_SEGNEXT(as, seg);
3361 if (seg == NULL || raddr != seg->s_base) {
3362 error = ENOMEM;
3363 break;
3364 }
3409 if (error == 0 && segvn) {
3410 /* The last chunk when rsize == 0. */
3411 ASSERT(setsize != 0);
3412 error = as_iset_default_lpsize(as, setaddr, setsize,
3413 rflags, rtype);
3414 }
3415
3416 if (error == IE_RETRY) {
3417 goto again;
3418 } else if (error == IE_NOMEM) {
3419 error = EAGAIN;
3420 } else if (error == ENOTSUP) {
3421 error = EINVAL;
3422 } else if (error == EAGAIN) {
3423 mutex_enter(&as->a_contents);
3424 if (!AS_ISNOUNMAPWAIT(as)) {
3425 if (AS_ISUNMAPWAIT(as) == 0) {
3426 cv_broadcast(&as->a_cv);
3427 }
3428 AS_SETUNMAPWAIT(as);
3429 AS_LOCK_EXIT(as);
3430 while (AS_ISUNMAPWAIT(as)) {
3431 cv_wait(&as->a_cv, &as->a_contents);
3432 }
3433 mutex_exit(&as->a_contents);
3434 AS_LOCK_ENTER(as, RW_WRITER);
3435 } else {
3436 /*
3437 * We may have raced with
3438 * segvn_reclaim()/segspt_reclaim(). In this case
3439 * clean nounmapwait flag and retry since softlockcnt
3440 * in this segment may be already 0. We don't drop as
3441 * writer lock so our number of retries without
3442 * sleeping should be very small. See segvn_reclaim()
3443 * for more comments.
3444 */
3445 AS_CLRNOUNMAPWAIT(as);
3446 mutex_exit(&as->a_contents);
3447 }
3448 goto again;
3449 }
3450
3451 as_setwatch(as);
3452 AS_LOCK_EXIT(as);
3453 return (error);
3454 }
3455
3456 /*
3457 * Setup all of the uninitialized watched pages that we can.
3458 */
3459 void
3460 as_setwatch(struct as *as)
3461 {
3462 struct watched_page *pwp;
3463 struct seg *seg;
3464 caddr_t vaddr;
3465 uint_t prot;
3466 int err, retrycnt;
3467
3468 if (avl_numnodes(&as->a_wpage) == 0)
3469 return;
3470
3471 ASSERT(AS_WRITE_HELD(as));
3472
3473 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3474 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3475 retrycnt = 0;
3476 retry:
3477 vaddr = pwp->wp_vaddr;
3478 if (pwp->wp_oprot != 0 || /* already set up */
3479 (seg = as_segat(as, vaddr)) == NULL ||
3480 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3481 continue;
3482
3483 pwp->wp_oprot = prot;
3484 if (pwp->wp_read)
3485 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3486 if (pwp->wp_write)
3487 prot &= ~PROT_WRITE;
3488 if (pwp->wp_exec)
3489 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3490 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3491 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3498 }
3499 pwp->wp_prot = prot;
3500 }
3501 }
3502
3503 /*
3504 * Clear all of the watched pages in the address space.
3505 */
3506 void
3507 as_clearwatch(struct as *as)
3508 {
3509 struct watched_page *pwp;
3510 struct seg *seg;
3511 caddr_t vaddr;
3512 uint_t prot;
3513 int err, retrycnt;
3514
3515 if (avl_numnodes(&as->a_wpage) == 0)
3516 return;
3517
3518 ASSERT(AS_WRITE_HELD(as));
3519
3520 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3521 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3522 retrycnt = 0;
3523 retry:
3524 vaddr = pwp->wp_vaddr;
3525 if (pwp->wp_oprot == 0 || /* not set up */
3526 (seg = as_segat(as, vaddr)) == NULL)
3527 continue;
3528
3529 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3530 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3531 if (err == IE_RETRY) {
3532 ASSERT(retrycnt == 0);
3533 retrycnt++;
3534 goto retry;
3535 }
3536 }
3537 pwp->wp_oprot = 0;
3538 pwp->wp_prot = 0;
3540 }
3541
3542 /*
3543 * Force a new setup for all the watched pages in the range.
3544 */
3545 static void
3546 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3547 {
3548 struct watched_page *pwp;
3549 struct watched_page tpw;
3550 caddr_t eaddr = addr + size;
3551 caddr_t vaddr;
3552 struct seg *seg;
3553 int err, retrycnt;
3554 uint_t wprot;
3555 avl_index_t where;
3556
3557 if (avl_numnodes(&as->a_wpage) == 0)
3558 return;
3559
3560 ASSERT(AS_WRITE_HELD(as));
3561
3562 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3563 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3564 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3565
3566 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3567 retrycnt = 0;
3568 vaddr = pwp->wp_vaddr;
3569
3570 wprot = prot;
3571 if (pwp->wp_read)
3572 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3573 if (pwp->wp_write)
3574 wprot &= ~PROT_WRITE;
3575 if (pwp->wp_exec)
3576 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3577 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3578 retry:
3579 seg = as_segat(as, vaddr);
3580 if (seg == NULL) {
3599 * Clear all of the watched pages in the range.
3600 */
3601 static void
3602 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3603 {
3604 caddr_t eaddr = addr + size;
3605 struct watched_page *pwp;
3606 struct watched_page tpw;
3607 uint_t prot;
3608 struct seg *seg;
3609 int err, retrycnt;
3610 avl_index_t where;
3611
3612 if (avl_numnodes(&as->a_wpage) == 0)
3613 return;
3614
3615 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3616 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3617 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3618
3619 ASSERT(AS_WRITE_HELD(as));
3620
3621 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3622
3623 if ((prot = pwp->wp_oprot) != 0) {
3624 retrycnt = 0;
3625
3626 if (prot != pwp->wp_prot) {
3627 retry:
3628 seg = as_segat(as, pwp->wp_vaddr);
3629 if (seg == NULL)
3630 continue;
3631 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3632 PAGESIZE, prot);
3633 if (err == IE_RETRY) {
3634 ASSERT(retrycnt == 0);
3635 retrycnt++;
3636 goto retry;
3637
3638 }
3639 }
3654 for (p = practive; p; p = p->p_next) {
3655 if (p->p_as == as) {
3656 mutex_enter(&p->p_lock);
3657 if (p->p_as == as)
3658 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3659 mutex_exit(&p->p_lock);
3660 }
3661 }
3662 mutex_exit(&pidlock);
3663 }
3664
3665 /*
3666 * return memory object ID
3667 */
3668 int
3669 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3670 {
3671 struct seg *seg;
3672 int sts;
3673
3674 AS_LOCK_ENTER(as, RW_READER);
3675 seg = as_segat(as, addr);
3676 if (seg == NULL) {
3677 AS_LOCK_EXIT(as);
3678 return (EFAULT);
3679 }
3680 /*
3681 * catch old drivers which may not support getmemid
3682 */
3683 if (seg->s_ops->getmemid == NULL) {
3684 AS_LOCK_EXIT(as);
3685 return (ENODEV);
3686 }
3687
3688 sts = SEGOP_GETMEMID(seg, addr, memidp);
3689
3690 AS_LOCK_EXIT(as);
3691 return (sts);
3692 }
|