Print this page
XXXX pass in cpu_pause_func via pause_cpus
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/os/mem_config.c
+++ new/usr/src/uts/common/os/mem_config.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 25
26 26 #include <sys/types.h>
27 27 #include <sys/cmn_err.h>
28 28 #include <sys/vmem.h>
29 29 #include <sys/kmem.h>
30 30 #include <sys/systm.h>
31 31 #include <sys/machsystm.h> /* for page_freelist_coalesce() */
32 32 #include <sys/errno.h>
33 33 #include <sys/memnode.h>
34 34 #include <sys/memlist.h>
35 35 #include <sys/memlist_impl.h>
36 36 #include <sys/tuneable.h>
37 37 #include <sys/proc.h>
38 38 #include <sys/disp.h>
39 39 #include <sys/debug.h>
40 40 #include <sys/vm.h>
41 41 #include <sys/callb.h>
42 42 #include <sys/memlist_plat.h> /* for installed_top_size() */
43 43 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */
44 44 #include <sys/dumphdr.h> /* for dump_resize() */
45 45 #include <sys/atomic.h> /* for use in stats collection */
46 46 #include <sys/rwlock.h>
47 47 #include <sys/cpuvar.h>
48 48 #include <vm/seg_kmem.h>
49 49 #include <vm/seg_kpm.h>
50 50 #include <vm/page.h>
51 51 #include <vm/vm_dep.h>
52 52 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */
53 53 #include <sys/sunddi.h>
54 54 #include <sys/mem_config.h>
55 55 #include <sys/mem_cage.h>
56 56 #include <sys/lgrp.h>
57 57 #include <sys/ddi.h>
58 58 #include <sys/modctl.h>
59 59
60 60 extern struct memlist *phys_avail;
61 61
62 62 extern uint_t page_ctrs_adjust(int);
63 63 void page_ctrs_cleanup(void);
64 64 static void kphysm_setup_post_add(pgcnt_t);
65 65 static int kphysm_setup_pre_del(pgcnt_t);
66 66 static void kphysm_setup_post_del(pgcnt_t, int);
67 67
68 68 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);
69 69
70 70 static int delspan_reserve(pfn_t, pgcnt_t);
71 71 static void delspan_unreserve(pfn_t, pgcnt_t);
72 72
73 73 kmutex_t memseg_lists_lock;
74 74 struct memseg *memseg_va_avail;
75 75 struct memseg *memseg_alloc(void);
76 76 static struct memseg *memseg_delete_junk;
77 77 static struct memseg *memseg_edit_junk;
78 78 void memseg_remap_init(void);
79 79 static void memseg_remap_to_dummy(struct memseg *);
80 80 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
81 81 static struct memseg *memseg_reuse(pgcnt_t);
82 82
83 83 static struct kmem_cache *memseg_cache;
84 84
85 85 /*
86 86 * Interfaces to manage externally allocated
87 87 * page_t memory (metadata) for a memseg.
88 88 */
89 89 #pragma weak memseg_alloc_meta
90 90 #pragma weak memseg_free_meta
91 91 #pragma weak memseg_get_metapfn
92 92 #pragma weak memseg_remap_meta
93 93
94 94 extern int ppvm_enable;
95 95 extern page_t *ppvm_base;
96 96 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
97 97 extern void memseg_free_meta(void *, pgcnt_t);
98 98 extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
99 99 extern void memseg_remap_meta(struct memseg *);
100 100 static int memseg_is_dynamic(struct memseg *);
101 101 static int memseg_includes_meta(struct memseg *);
102 102 pfn_t memseg_get_start(struct memseg *);
103 103 static void memseg_cpu_vm_flush(void);
104 104
105 105 int meta_alloc_enable;
106 106
/*
 * MEMSEG_DEBUG(fmt, ...): printf-style tracing for memseg management.
 *
 * On DEBUG kernels the message is printed only when the memseg_debug
 * flag is non-zero; on non-DEBUG kernels the macro expands to nothing.
 *
 * The DEBUG form is wrapped in do { } while (0) so that it behaves as a
 * single statement: a bare "if (x) printf(...)" expansion would capture
 * the "else" of an enclosing unbraced if/else.
 */
#ifdef DEBUG
static int memseg_debug;
#define	MEMSEG_DEBUG(...)				\
	do {						\
		if (memseg_debug)			\
			printf(__VA_ARGS__);		\
	} while (0)
#else
#define	MEMSEG_DEBUG(...)
#endif
113 113
114 114 /*
115 115 * Add a chunk of memory to the system.
116 116 * base: starting PAGESIZE page of new memory.
117 117 * npgs: length in PAGESIZE pages.
118 118 *
119 119 * Adding mem this way doesn't increase the size of the hash tables;
120 120 * growing them would be too hard. This should be OK, but adding memory
121 121 * dynamically most likely means more hash misses, since the tables will
122 122 * be smaller than they otherwise would be.
123 123 */
124 124 int
125 125 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
126 126 {
127 127 page_t *pp;
128 128 page_t *opp, *oepp, *segpp;
129 129 struct memseg *seg;
130 130 uint64_t avmem;
131 131 pfn_t pfn;
132 132 pfn_t pt_base = base;
133 133 pgcnt_t tpgs = npgs;
134 134 pgcnt_t metapgs = 0;
135 135 int exhausted;
136 136 pfn_t pnum;
137 137 int mnode;
138 138 caddr_t vaddr;
139 139 int reuse;
140 140 int mlret;
141 141 int rv;
142 142 int flags;
143 143 int meta_alloc = 0;
144 144 void *mapva;
145 145 void *metabase = (void *)base;
146 146 pgcnt_t nkpmpgs = 0;
147 147 offset_t kpm_pages_off;
148 148
149 149 cmn_err(CE_CONT,
150 150 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
151 151 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
152 152
153 153 /*
154 154 * Add this span in the delete list to prevent interactions.
155 155 */
156 156 if (!delspan_reserve(base, npgs)) {
157 157 return (KPHYSM_ESPAN);
158 158 }
159 159 /*
160 160 * Check to see if any of the memory span has been added
161 161 * by trying an add to the installed memory list. This
162 162 * forms the interlocking process for add.
163 163 */
164 164
165 165 memlist_write_lock();
166 166
167 167 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
168 168 (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
169 169
170 170 if (mlret == MEML_SPANOP_OK)
171 171 installed_top_size(phys_install, &physmax, &physinstalled);
172 172
173 173 memlist_write_unlock();
174 174
175 175 if (mlret != MEML_SPANOP_OK) {
176 176 if (mlret == MEML_SPANOP_EALLOC) {
177 177 delspan_unreserve(pt_base, tpgs);
178 178 return (KPHYSM_ERESOURCE);
179 179 } else if (mlret == MEML_SPANOP_ESPAN) {
180 180 delspan_unreserve(pt_base, tpgs);
181 181 return (KPHYSM_ESPAN);
182 182 } else {
183 183 delspan_unreserve(pt_base, tpgs);
184 184 return (KPHYSM_ERESOURCE);
185 185 }
186 186 }
187 187
188 188 if (meta_alloc_enable) {
189 189 /*
190 190 * Allocate the page_t's from existing memory;
191 191 * if that fails, allocate from the incoming memory.
192 192 */
193 193 rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
194 194 if (rv == KPHYSM_OK) {
195 195 ASSERT(metapgs);
196 196 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
197 197 meta_alloc = 1;
198 198 goto mapalloc;
199 199 }
200 200 }
201 201
202 202 /*
203 203 * We store the page_t's for this new memory in the first
204 204 * few pages of the chunk. Here, we go and get'em ...
205 205 */
206 206
207 207 /*
208 208 * The expression after the '-' gives the number of pages
209 209 * that will fit in the new memory based on a requirement
210 210 * of (PAGESIZE + sizeof (page_t)) bytes per page.
211 211 */
212 212 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
213 213 (PAGESIZE + sizeof (page_t)));
214 214
215 215 npgs -= metapgs;
216 216 base += metapgs;
217 217
218 218 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
219 219
220 220 exhausted = (metapgs == 0 || npgs == 0);
221 221
222 222 if (kpm_enable && !exhausted) {
223 223 pgcnt_t start, end, nkpmpgs_prelim;
224 224 size_t ptsz;
225 225
226 226 /*
227 227 * A viable kpm large page mapping must not overlap two
228 228 * dynamic memsegs. Therefore the total size is checked
229 229 * to be at least kpm_pgsz and also whether start and end
230 230 * points are at least kpm_pgsz aligned.
231 231 */
232 232 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
233 233 pmodkpmp(base + npgs)) {
234 234
235 235 kphysm_addmem_error_undospan(pt_base, tpgs);
236 236
237 237 /*
238 238 * There is no specific error code for violating
239 239 * kpm granularity constraints.
240 240 */
241 241 return (KPHYSM_ENOTVIABLE);
242 242 }
243 243
244 244 start = kpmptop(ptokpmp(base));
245 245 end = kpmptop(ptokpmp(base + npgs));
246 246 nkpmpgs_prelim = ptokpmp(end - start);
247 247 ptsz = npgs * sizeof (page_t);
248 248 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
249 249 exhausted = (tpgs <= metapgs);
250 250 if (!exhausted) {
251 251 npgs = tpgs - metapgs;
252 252 base = pt_base + metapgs;
253 253
254 254 /* final nkpmpgs */
255 255 start = kpmptop(ptokpmp(base));
256 256 nkpmpgs = ptokpmp(end - start);
257 257 kpm_pages_off = ptsz +
258 258 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
259 259 }
260 260 }
261 261
262 262 /*
263 263 * Is memory area supplied too small?
264 264 */
265 265 if (exhausted) {
266 266 kphysm_addmem_error_undospan(pt_base, tpgs);
267 267 /*
268 268 * There is no specific error code for 'too small'.
269 269 */
270 270 return (KPHYSM_ERESOURCE);
271 271 }
272 272
273 273 mapalloc:
274 274 /*
275 275 * We may re-use a previously allocated VA space for the page_ts
276 276 * eventually, but we need to initialize and lock the pages first.
277 277 */
278 278
279 279 /*
280 280 * Get an address in the kernel address map, map
281 281 * the page_t pages and see if we can touch them.
282 282 */
283 283
284 284 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
285 285 if (mapva == NULL) {
286 286 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
287 287 " Can't allocate VA for page_ts");
288 288
289 289 if (meta_alloc)
290 290 memseg_free_meta(metabase, metapgs);
291 291 kphysm_addmem_error_undospan(pt_base, tpgs);
292 292
293 293 return (KPHYSM_ERESOURCE);
294 294 }
295 295 pp = mapva;
296 296
297 297 if (physmax < (pt_base + tpgs))
298 298 physmax = (pt_base + tpgs);
299 299
300 300 /*
301 301 * In the remapping code we map one page at a time so we must do
302 302 * the same here to match mapping sizes.
303 303 */
304 304 pfn = pt_base;
305 305 vaddr = (caddr_t)pp;
306 306 for (pnum = 0; pnum < metapgs; pnum++) {
307 307 if (meta_alloc)
308 308 pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
309 309 hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
310 310 PROT_READ | PROT_WRITE,
311 311 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
312 312 pfn++;
313 313 vaddr += ptob(1);
314 314 }
315 315
316 316 if (ddi_peek32((dev_info_t *)NULL,
317 317 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
318 318
319 319 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
320 320 " Can't access pp array at 0x%p [phys 0x%lx]",
321 321 (void *)pp, pt_base);
322 322
323 323 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
324 324 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
325 325
326 326 vmem_free(heap_arena, mapva, ptob(metapgs));
327 327 if (meta_alloc)
328 328 memseg_free_meta(metabase, metapgs);
329 329 kphysm_addmem_error_undospan(pt_base, tpgs);
330 330
331 331 return (KPHYSM_EFAULT);
332 332 }
333 333
334 334 /*
335 335 * Add this memory slice to its memory node translation.
336 336 *
337 337 * Note that right now, each node may have only one slice;
338 338 * this may change with COD or in larger SSM systems with
339 339 * nested latency groups, so we must not assume that the
340 340 * node does not yet exist.
341 341 *
342 342 * Note that there may be multiple memory nodes associated with
343 343 * a single lgrp node on x86 systems.
344 344 */
345 345 pnum = pt_base + tpgs - 1;
346 346 mem_node_add_range(pt_base, pnum);
347 347
348 348 /*
349 349 * Allocate or resize page counters as necessary to accommodate
350 350 * the increase in memory pages.
351 351 */
352 352 mnode = PFN_2_MEM_NODE(pnum);
353 353 PAGE_CTRS_ADJUST(base, npgs, rv);
354 354 if (rv) {
355 355
356 356 mem_node_del_range(pt_base, pnum);
357 357
358 358 /* cleanup the page counters */
359 359 page_ctrs_cleanup();
360 360
361 361 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
362 362 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
363 363
364 364 vmem_free(heap_arena, mapva, ptob(metapgs));
365 365 if (meta_alloc)
366 366 memseg_free_meta(metabase, metapgs);
367 367 kphysm_addmem_error_undospan(pt_base, tpgs);
368 368
369 369 return (KPHYSM_ERESOURCE);
370 370 }
371 371
372 372 /*
373 373 * Update the phys_avail memory list.
374 374 * The phys_install list was done at the start.
375 375 */
376 376
377 377 memlist_write_lock();
378 378
379 379 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
380 380 (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
381 381 ASSERT(mlret == MEML_SPANOP_OK);
382 382
383 383 memlist_write_unlock();
384 384
385 385 /* See if we can find a memseg to re-use. */
386 386 if (meta_alloc) {
387 387 seg = memseg_reuse(0);
388 388 reuse = 1; /* force unmapping of temp mapva */
389 389 flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
390 390 /*
391 391 * There is a 1:1 fixed relationship between a pfn
392 392 * and a page_t VA. The pfn is used as an index into
393 393 * the ppvm_base page_t table in order to calculate
394 394 * the page_t base address for a given pfn range.
395 395 */
396 396 segpp = ppvm_base + base;
397 397 } else {
398 398 seg = memseg_reuse(metapgs);
399 399 reuse = (seg != NULL);
400 400 flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
401 401 segpp = pp;
402 402 }
403 403
404 404 /*
405 405 * Initialize the memseg structure representing this memory
406 406 * and add it to the existing list of memsegs. Do some basic
407 407 * initialization and add the memory to the system.
408 408 * In order to prevent lock deadlocks, the add_physmem()
409 409 * code is repeated here, but split into several stages.
410 410 *
411 411 * If a memseg is reused, invalidate memseg pointers in
412 412 * all cpu vm caches. We need to do this this since the check
413 413 * pp >= seg->pages && pp < seg->epages
414 414 * used in various places is not atomic and so the first compare
415 415 * can happen before reuse and the second compare after reuse.
416 416 * The invalidation ensures that a memseg is not deferenced while
417 417 * it's page/pfn pointers are changing.
418 418 */
419 419 if (seg == NULL) {
420 420 seg = memseg_alloc();
421 421 ASSERT(seg != NULL);
422 422 seg->msegflags = flags;
423 423 MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
424 424 (void *)seg, (void *)(seg->pages));
425 425 seg->pages = segpp;
426 426 } else {
427 427 ASSERT(seg->msegflags == flags);
428 428 ASSERT(seg->pages_base == seg->pages_end);
429 429 MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
430 430 (void *)seg, (void *)(seg->pages));
431 431 if (meta_alloc) {
432 432 memseg_cpu_vm_flush();
433 433 seg->pages = segpp;
434 434 }
435 435 }
436 436
437 437 seg->epages = seg->pages + npgs;
438 438 seg->pages_base = base;
439 439 seg->pages_end = base + npgs;
440 440
441 441 /*
442 442 * Initialize metadata. The page_ts are set to locked state
443 443 * ready to be freed.
444 444 */
445 445 bzero((caddr_t)pp, ptob(metapgs));
446 446
447 447 pfn = seg->pages_base;
448 448 /* Save the original pp base in case we reuse a memseg. */
449 449 opp = pp;
450 450 oepp = opp + npgs;
451 451 for (pp = opp; pp < oepp; pp++) {
452 452 pp->p_pagenum = pfn;
453 453 pfn++;
454 454 page_iolock_init(pp);
455 455 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
456 456 continue;
457 457 pp->p_offset = (u_offset_t)-1;
458 458 }
459 459
460 460 if (reuse) {
461 461 /* Remap our page_ts to the re-used memseg VA space. */
462 462 pfn = pt_base;
463 463 vaddr = (caddr_t)seg->pages;
464 464 for (pnum = 0; pnum < metapgs; pnum++) {
465 465 if (meta_alloc)
466 466 pfn = memseg_get_metapfn(metabase,
467 467 (pgcnt_t)pnum);
468 468 hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
469 469 PROT_READ | PROT_WRITE,
470 470 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
471 471 pfn++;
472 472 vaddr += ptob(1);
473 473 }
474 474
475 475 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
476 476 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
477 477
478 478 vmem_free(heap_arena, mapva, ptob(metapgs));
479 479 }
480 480
481 481 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
482 482
483 483 memsegs_lock(1);
484 484
485 485 /*
486 486 * The new memseg is inserted at the beginning of the list.
487 487 * Not only does this save searching for the tail, but in the
488 488 * case of a re-used memseg, it solves the problem of what
489 489 * happens if some process has still got a pointer to the
490 490 * memseg and follows the next pointer to continue traversing
491 491 * the memsegs list.
492 492 */
493 493
494 494 hat_kpm_addmem_mseg_insert(seg);
495 495
496 496 seg->next = memsegs;
497 497 membar_producer();
498 498
499 499 hat_kpm_addmem_memsegs_update(seg);
500 500
501 501 memsegs = seg;
502 502
503 503 build_pfn_hash();
504 504
505 505 total_pages += npgs;
506 506
507 507 /*
508 508 * Recalculate the paging parameters now total_pages has changed.
509 509 * This will also cause the clock hands to be reset before next use.
510 510 */
511 511 setupclock(1);
512 512
513 513 memsegs_unlock(1);
514 514
515 515 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
516 516
517 517 /*
518 518 * Free the pages outside the lock to avoid locking loops.
519 519 */
520 520 for (pp = seg->pages; pp < seg->epages; pp++) {
521 521 page_free(pp, 1);
522 522 }
523 523
524 524 /*
525 525 * Now that we've updated the appropriate memory lists we
526 526 * need to reset a number of globals, since we've increased memory.
527 527 * Several have already been updated for us as noted above. The
528 528 * globals we're interested in at this point are:
529 529 * physmax - highest page frame number.
530 530 * physinstalled - number of pages currently installed (done earlier)
531 531 * maxmem - max free pages in the system
532 532 * physmem - physical memory pages available
533 533 * availrmem - real memory available
534 534 */
535 535
536 536 mutex_enter(&freemem_lock);
537 537 maxmem += npgs;
538 538 physmem += npgs;
539 539 availrmem += npgs;
540 540 availrmem_initial += npgs;
541 541
542 542 mutex_exit(&freemem_lock);
543 543
544 544 dump_resize();
545 545
546 546 page_freelist_coalesce_all(mnode);
547 547
548 548 kphysm_setup_post_add(npgs);
549 549
550 550 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
551 551 "(0x%" PRIx64 ")\n",
552 552 physinstalled << (PAGESHIFT - 10),
553 553 (uint64_t)physinstalled << PAGESHIFT);
554 554
555 555 avmem = (uint64_t)freemem << PAGESHIFT;
556 556 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
557 557 "avail mem = %" PRId64 "\n", avmem);
558 558
559 559 /*
560 560 * Update lgroup generation number on single lgroup systems
561 561 */
562 562 if (nlgrps == 1)
563 563 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
564 564
565 565 /*
566 566 * Inform DDI of update
567 567 */
568 568 ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
569 569 (uint64_t)(tpgs) << PAGESHIFT);
570 570
571 571 delspan_unreserve(pt_base, tpgs);
572 572
573 573 return (KPHYSM_OK); /* Successfully added system memory */
574 574 }
575 575
576 576 /*
577 577 * There are various error conditions in kphysm_add_memory_dynamic()
578 578 * which require a rollback of already changed global state.
579 579 */
580 580 static void
581 581 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
582 582 {
583 583 int mlret;
584 584
585 585 /* Unreserve memory span. */
586 586 memlist_write_lock();
587 587
588 588 mlret = memlist_delete_span(
589 589 (uint64_t)(pt_base) << PAGESHIFT,
590 590 (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
591 591
592 592 ASSERT(mlret == MEML_SPANOP_OK);
593 593 phys_install_has_changed();
594 594 installed_top_size(phys_install, &physmax, &physinstalled);
595 595
596 596 memlist_write_unlock();
597 597 delspan_unreserve(pt_base, tpgs);
598 598 }
599 599
600 600 /*
601 601 * Only return an available memseg of exactly the right size
602 602 * if size is required.
603 603 * When the meta data area has it's own virtual address space
604 604 * we will need to manage this more carefully and do best fit
605 605 * allocations, possibly splitting an available area.
606 606 */
607 607 struct memseg *
608 608 memseg_reuse(pgcnt_t metapgs)
609 609 {
610 610 int type;
611 611 struct memseg **segpp, *seg;
612 612
613 613 mutex_enter(&memseg_lists_lock);
614 614
615 615 segpp = &memseg_va_avail;
616 616 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
617 617 caddr_t end;
618 618
619 619 /*
620 620 * Make sure we are reusing the right segment type.
621 621 */
622 622 type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
623 623
624 624 if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
625 625 != type)
626 626 continue;
627 627
628 628 if (kpm_enable)
629 629 end = hat_kpm_mseg_reuse(seg);
630 630 else
631 631 end = (caddr_t)seg->epages;
632 632
633 633 /*
634 634 * Check for the right size if it is provided.
635 635 */
636 636 if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
637 637 *segpp = seg->lnext;
638 638 seg->lnext = NULL;
639 639 break;
640 640 }
641 641 }
642 642 mutex_exit(&memseg_lists_lock);
643 643
644 644 return (seg);
645 645 }
646 646
647 647 static uint_t handle_gen;
648 648
649 649 struct memdelspan {
650 650 struct memdelspan *mds_next;
651 651 pfn_t mds_base;
652 652 pgcnt_t mds_npgs;
653 653 uint_t *mds_bitmap;
654 654 uint_t *mds_bitmap_retired;
655 655 };
656 656
657 657 #define NBPBMW (sizeof (uint_t) * NBBY)
658 658 #define MDS_BITMAPBYTES(MDSP) \
659 659 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
660 660
/*
 * A set of in-transit spans belonging to one operation.  Active lists
 * are chained from transit_list_head through trl_next.
 */
struct transit_list {
	struct transit_list	*trl_next;	/* next active list */
	struct memdelspan	*trl_spans;	/* spans held by this list */
	int			trl_collect;	/* NOTE(review): usage not */
						/* visible in this chunk */
};
666 666
667 667 struct transit_list_head {
668 668 kmutex_t trh_lock;
669 669 struct transit_list *trh_head;
670 670 };
671 671
672 672 static struct transit_list_head transit_list_head;
673 673
674 674 struct mem_handle;
675 675 static void transit_list_collect(struct mem_handle *, int);
676 676 static void transit_list_insert(struct transit_list *);
677 677 static void transit_list_remove(struct transit_list *);
678 678
/*
 * Delete statistics are collected on DEBUG kernels only.
 */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;

/*
 * Event counters and tick totals kept per mem_handle (mh_delstat).
 */
struct mem_del_stat {
	uint_t		nloop;
	uint_t		need_free;
	uint_t		free_loop;
	uint_t		free_low;
	uint_t		free_failed;
	uint_t		ncheck;
	uint_t		nopaget;
	uint_t		lockfail;
	uint_t		nfree;
	uint_t		nreloc;
	uint_t		nrelocfail;
	uint_t		already_done;
	uint_t		first_notfree;
	uint_t		npplocked;
	uint_t		nlockreloc;
	uint_t		nnorepl;
	uint_t		nmodreloc;
	uint_t		ndestroy;
	uint_t		nputpage;
	uint_t		nnoreclaim;
	uint_t		ndelay;
	uint_t		demotefail;
	uint64_t	nticks_total;
	uint64_t	nticks_pgrp;
	uint_t		retired;
	uint_t		toxic;
	uint_t		failing;
	uint_t		modtoxic;
	uint_t		npplkdtoxic;
	uint_t		gptlmodfail;
	uint_t		gptllckfail;
};

/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
#define	MDSTAT_INCR(MHP, FLD)	((MHP)->mh_delstat.FLD++)
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))

#else /* MEM_DEL_STATS */

/* Statistics compiled out: every hook expands to nothing. */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)

#endif /* MEM_DEL_STATS */
733 733
/*
 * States of a memory delete handle (mem_handle.mh_state).
 * MHND_FREE must stay 0: handles come from kmem_zalloc() and are
 * expected to start out in the FREE state.
 */
typedef enum mhnd_state {
	MHND_FREE = 0,
	MHND_INIT,
	MHND_STARTING,
	MHND_RUNNING,
	MHND_DONE,
	MHND_RELEASE
} mhnd_state_t;
736 736
737 737 /*
738 738 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
739 739 * The mutex may not be required for other fields, dependent on mh_state.
740 740 */
741 741 struct mem_handle {
742 742 kmutex_t mh_mutex;
743 743 struct mem_handle *mh_next;
744 744 memhandle_t mh_exthandle;
745 745 mhnd_state_t mh_state;
746 746 struct transit_list mh_transit;
747 747 pgcnt_t mh_phys_pages;
748 748 pgcnt_t mh_vm_pages;
749 749 pgcnt_t mh_hold_todo;
750 750 void (*mh_delete_complete)(void *, int error);
751 751 void *mh_delete_complete_arg;
752 752 volatile uint_t mh_cancel;
753 753 volatile uint_t mh_dr_aio_cleanup_cancel;
754 754 volatile uint_t mh_aio_cleanup_done;
755 755 kcondvar_t mh_cv;
756 756 kthread_id_t mh_thread_id;
757 757 page_t *mh_deleted; /* link through p_next */
758 758 #ifdef MEM_DEL_STATS
759 759 struct mem_del_stat mh_delstat;
760 760 #endif /* MEM_DEL_STATS */
761 761 };
762 762
763 763 static struct mem_handle *mem_handle_head;
764 764 static kmutex_t mem_handle_list_mutex;
765 765
766 766 static struct mem_handle *
767 767 kphysm_allocate_mem_handle()
768 768 {
769 769 struct mem_handle *mhp;
770 770
771 771 mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
772 772 mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
773 773 mutex_enter(&mem_handle_list_mutex);
774 774 mutex_enter(&mhp->mh_mutex);
775 775 /* handle_gen is protected by list mutex. */
776 776 mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
777 777 mhp->mh_next = mem_handle_head;
778 778 mem_handle_head = mhp;
779 779 mutex_exit(&mem_handle_list_mutex);
780 780
781 781 return (mhp);
782 782 }
783 783
784 784 static void
785 785 kphysm_free_mem_handle(struct mem_handle *mhp)
786 786 {
787 787 struct mem_handle **mhpp;
788 788
789 789 ASSERT(mutex_owned(&mhp->mh_mutex));
790 790 ASSERT(mhp->mh_state == MHND_FREE);
791 791 /*
792 792 * Exit the mutex to preserve locking order. This is OK
793 793 * here as once in the FREE state, the handle cannot
794 794 * be found by a lookup.
795 795 */
796 796 mutex_exit(&mhp->mh_mutex);
797 797
798 798 mutex_enter(&mem_handle_list_mutex);
799 799 mhpp = &mem_handle_head;
800 800 while (*mhpp != NULL && *mhpp != mhp)
801 801 mhpp = &(*mhpp)->mh_next;
802 802 ASSERT(*mhpp == mhp);
803 803 /*
804 804 * No need to lock the handle (mh_mutex) as only
805 805 * mh_next changing and this is the only thread that
806 806 * can be referncing mhp.
807 807 */
808 808 *mhpp = mhp->mh_next;
809 809 mutex_exit(&mem_handle_list_mutex);
810 810
811 811 mutex_destroy(&mhp->mh_mutex);
812 812 kmem_free(mhp, sizeof (struct mem_handle));
813 813 }
814 814
815 815 /*
816 816 * This function finds the internal mem_handle corresponding to an
817 817 * external handle and returns it with the mh_mutex held.
818 818 */
819 819 static struct mem_handle *
820 820 kphysm_lookup_mem_handle(memhandle_t handle)
821 821 {
822 822 struct mem_handle *mhp;
823 823
824 824 mutex_enter(&mem_handle_list_mutex);
825 825 for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
826 826 if (mhp->mh_exthandle == handle) {
827 827 mutex_enter(&mhp->mh_mutex);
828 828 /*
829 829 * The state of the handle could have been changed
830 830 * by kphysm_del_release() while waiting for mh_mutex.
831 831 */
832 832 if (mhp->mh_state == MHND_FREE) {
833 833 mutex_exit(&mhp->mh_mutex);
834 834 continue;
835 835 }
836 836 break;
837 837 }
838 838 }
839 839 mutex_exit(&mem_handle_list_mutex);
840 840 return (mhp);
841 841 }
842 842
843 843 int
844 844 kphysm_del_gethandle(memhandle_t *xmhp)
845 845 {
846 846 struct mem_handle *mhp;
847 847
848 848 mhp = kphysm_allocate_mem_handle();
849 849 /*
850 850 * The handle is allocated using KM_SLEEP, so cannot fail.
851 851 * If the implementation is changed, the correct error to return
852 852 * here would be KPHYSM_ENOHANDLES.
853 853 */
854 854 ASSERT(mhp->mh_state == MHND_FREE);
855 855 mhp->mh_state = MHND_INIT;
856 856 *xmhp = mhp->mh_exthandle;
857 857 mutex_exit(&mhp->mh_mutex);
858 858 return (KPHYSM_OK);
859 859 }
860 860
861 861 static int
862 862 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
863 863 {
864 864 pfn_t e1, e2;
865 865
866 866 e1 = b1 + l1;
867 867 e2 = b2 + l2;
868 868
869 869 return (!(b2 >= e1 || b1 >= e2));
870 870 }
871 871
872 872 static int can_remove_pgs(pgcnt_t);
873 873
874 874 static struct memdelspan *
875 875 span_to_install(pfn_t base, pgcnt_t npgs)
876 876 {
877 877 struct memdelspan *mdsp;
878 878 struct memdelspan *mdsp_new;
879 879 uint64_t address, size, thislen;
880 880 struct memlist *mlp;
881 881
882 882 mdsp_new = NULL;
883 883
884 884 address = (uint64_t)base << PAGESHIFT;
885 885 size = (uint64_t)npgs << PAGESHIFT;
886 886 while (size != 0) {
887 887 memlist_read_lock();
888 888 for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
889 889 if (address >= (mlp->ml_address + mlp->ml_size))
890 890 continue;
891 891 if ((address + size) > mlp->ml_address)
892 892 break;
893 893 }
894 894 if (mlp == NULL) {
895 895 address += size;
896 896 size = 0;
897 897 thislen = 0;
898 898 } else {
899 899 if (address < mlp->ml_address) {
900 900 size -= (mlp->ml_address - address);
901 901 address = mlp->ml_address;
902 902 }
903 903 ASSERT(address >= mlp->ml_address);
904 904 if ((address + size) >
905 905 (mlp->ml_address + mlp->ml_size)) {
906 906 thislen =
907 907 mlp->ml_size - (address - mlp->ml_address);
908 908 } else {
909 909 thislen = size;
910 910 }
911 911 }
912 912 memlist_read_unlock();
913 913 /* TODO: phys_install could change now */
914 914 if (thislen == 0)
915 915 continue;
916 916 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
917 917 mdsp->mds_base = btop(address);
918 918 mdsp->mds_npgs = btop(thislen);
919 919 mdsp->mds_next = mdsp_new;
920 920 mdsp_new = mdsp;
921 921 address += thislen;
922 922 size -= thislen;
923 923 }
924 924 return (mdsp_new);
925 925 }
926 926
927 927 static void
928 928 free_delspans(struct memdelspan *mdsp)
929 929 {
930 930 struct memdelspan *amdsp;
931 931
932 932 while ((amdsp = mdsp) != NULL) {
933 933 mdsp = amdsp->mds_next;
934 934 kmem_free(amdsp, sizeof (struct memdelspan));
935 935 }
936 936 }
937 937
938 938 /*
939 939 * Concatenate lists. No list ordering is required.
940 940 */
941 941
942 942 static void
943 943 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
944 944 {
945 945 while (*mdspp != NULL)
946 946 mdspp = &(*mdspp)->mds_next;
947 947
948 948 *mdspp = mdsp;
949 949 }
950 950
951 951 /*
952 952 * Given a new list of delspans, check there is no overlap with
953 953 * all existing span activity (add or delete) and then concatenate
954 954 * the new spans to the given list.
955 955 * Return 1 for OK, 0 if overlapping.
956 956 */
957 957 static int
958 958 delspan_insert(
959 959 struct transit_list *my_tlp,
960 960 struct memdelspan *mdsp_new)
961 961 {
962 962 struct transit_list_head *trh;
963 963 struct transit_list *tlp;
964 964 int ret;
965 965
966 966 trh = &transit_list_head;
967 967
968 968 ASSERT(my_tlp != NULL);
969 969 ASSERT(mdsp_new != NULL);
970 970
971 971 ret = 1;
972 972 mutex_enter(&trh->trh_lock);
973 973 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
974 974 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
975 975 struct memdelspan *mdsp;
976 976
977 977 for (mdsp = tlp->trl_spans; mdsp != NULL;
978 978 mdsp = mdsp->mds_next) {
979 979 struct memdelspan *nmdsp;
980 980
981 981 for (nmdsp = mdsp_new; nmdsp != NULL;
982 982 nmdsp = nmdsp->mds_next) {
983 983 if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
984 984 nmdsp->mds_base, nmdsp->mds_npgs)) {
985 985 ret = 0;
986 986 goto done;
987 987 }
988 988 }
989 989 }
990 990 }
991 991 done:
992 992 if (ret != 0) {
993 993 if (my_tlp->trl_spans == NULL)
994 994 transit_list_insert(my_tlp);
995 995 delspan_concat(&my_tlp->trl_spans, mdsp_new);
996 996 }
997 997 mutex_exit(&trh->trh_lock);
998 998 return (ret);
999 999 }
1000 1000
1001 1001 static void
1002 1002 delspan_remove(
1003 1003 struct transit_list *my_tlp,
1004 1004 pfn_t base,
1005 1005 pgcnt_t npgs)
1006 1006 {
1007 1007 struct transit_list_head *trh;
1008 1008 struct memdelspan *mdsp;
1009 1009
1010 1010 trh = &transit_list_head;
1011 1011
1012 1012 ASSERT(my_tlp != NULL);
1013 1013
1014 1014 mutex_enter(&trh->trh_lock);
1015 1015 if ((mdsp = my_tlp->trl_spans) != NULL) {
1016 1016 if (npgs == 0) {
1017 1017 my_tlp->trl_spans = NULL;
1018 1018 free_delspans(mdsp);
1019 1019 transit_list_remove(my_tlp);
1020 1020 } else {
1021 1021 struct memdelspan **prv;
1022 1022
1023 1023 prv = &my_tlp->trl_spans;
1024 1024 while (mdsp != NULL) {
1025 1025 pfn_t p_end;
1026 1026
1027 1027 p_end = mdsp->mds_base + mdsp->mds_npgs;
1028 1028 if (mdsp->mds_base >= base &&
1029 1029 p_end <= (base + npgs)) {
1030 1030 *prv = mdsp->mds_next;
1031 1031 mdsp->mds_next = NULL;
1032 1032 free_delspans(mdsp);
1033 1033 } else {
1034 1034 prv = &mdsp->mds_next;
1035 1035 }
1036 1036 mdsp = *prv;
1037 1037 }
1038 1038 if (my_tlp->trl_spans == NULL)
1039 1039 transit_list_remove(my_tlp);
1040 1040 }
1041 1041 }
1042 1042 mutex_exit(&trh->trh_lock);
1043 1043 }
1044 1044
1045 1045 /*
1046 1046 * Reserve interface for add to stop delete before add finished.
1047 1047 * This list is only accessed through the delspan_insert/remove
1048 1048 * functions and so is fully protected by trh_lock in transit_list_head.
1049 1049 */
1050 1050
1051 1051 static struct transit_list reserve_transit;
1052 1052
1053 1053 static int
1054 1054 delspan_reserve(pfn_t base, pgcnt_t npgs)
1055 1055 {
1056 1056 struct memdelspan *mdsp;
1057 1057 int ret;
1058 1058
1059 1059 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
1060 1060 mdsp->mds_base = base;
1061 1061 mdsp->mds_npgs = npgs;
1062 1062 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
1063 1063 free_delspans(mdsp);
1064 1064 }
1065 1065 return (ret);
1066 1066 }
1067 1067
1068 1068 static void
1069 1069 delspan_unreserve(pfn_t base, pgcnt_t npgs)
1070 1070 {
1071 1071 delspan_remove(&reserve_transit, base, npgs);
1072 1072 }
1073 1073
1074 1074 /*
1075 1075 * Return whether memseg was created by kphysm_add_memory_dynamic().
1076 1076 */
1077 1077 static int
1078 1078 memseg_is_dynamic(struct memseg *seg)
1079 1079 {
1080 1080 return (seg->msegflags & MEMSEG_DYNAMIC);
1081 1081 }
1082 1082
/*
 * Add the span [base, base + npgs) to the delete request named by handle.
 *
 * The handle must be in MHND_INIT state.  The span is intersected with
 * installed memory by span_to_install(), inserted into the handle's
 * transit list (which fails if it overlaps any span already in transit),
 * and then every memseg overlapping the new spans is examined:
 *   - a memseg for which memseg_includes_meta() is true must lie wholly
 *     inside the span;
 *   - any other memseg that sticks out of the span is split with
 *     kphysm_split_memseg() and the whole scan is restarted;
 *   - no page of an accepted memseg may be PP_ISNORELOC.
 * On success the handle's mh_phys_pages and mh_vm_pages are increased.
 * On failure after insertion the spans are removed from the handle again.
 *
 * Returns KPHYSM_OK (also when the range holds no installed memory),
 * KPHYSM_EHANDLE, KPHYSM_ESEQUENCE, KPHYSM_EDUP, KPHYSM_EBUSY,
 * KPHYSM_ERESOURCE or KPHYSM_ENONRELOC.
 *
 * NOTE(review): kphysm_lookup_mem_handle() appears to return with
 * mh_mutex held, since every exit path below drops it - confirm.
 */
1083 1083 int
1084 1084 kphysm_del_span(
1085 1085 memhandle_t handle,
1086 1086 pfn_t base,
1087 1087 pgcnt_t npgs)
1088 1088 {
1089 1089 struct mem_handle *mhp;
1090 1090 struct memseg *seg;
1091 1091 struct memdelspan *mdsp;
1092 1092 struct memdelspan *mdsp_new;
1093 1093 pgcnt_t phys_pages, vm_pages;
1094 1094 pfn_t p_end;
1095 1095 page_t *pp;
1096 1096 int ret;
1097 1097
1098 1098 mhp = kphysm_lookup_mem_handle(handle);
1099 1099 if (mhp == NULL) {
1100 1100 return (KPHYSM_EHANDLE);
1101 1101 }
/* Spans are only accepted while the handle is in MHND_INIT state. */
1102 1102 if (mhp->mh_state != MHND_INIT) {
1103 1103 mutex_exit(&mhp->mh_mutex);
1104 1104 return (KPHYSM_ESEQUENCE);
1105 1105 }
1106 1106
1107 1107 /*
1108 1108 * Intersect the span with the installed memory list (phys_install).
1109 1109 */
1110 1110 mdsp_new = span_to_install(base, npgs);
1111 1111 if (mdsp_new == NULL) {
1112 1112 /*
1113 1113 * No physical memory in this range. Is this an
1114 1114 * error? If an attempt to start the delete is made
1115 1115 * for OK returns from del_span such as this, start will
1116 1116 * return an error.
1117 1117 * Could return KPHYSM_ENOWORK.
1118 1118 */
1119 1119 /*
1120 1120 * It is assumed that there are no error returns
1121 1121 * from span_to_install() due to kmem_alloc failure.
1122 1122 */
1123 1123 mutex_exit(&mhp->mh_mutex);
1124 1124 return (KPHYSM_OK);
1125 1125 }
1126 1126 /*
1127 1127 * Does this span overlap an existing span?
1128 1128 */
1129 1129 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
1130 1130 /*
1131 1131 * Differentiate between already on list for this handle
1132 1132 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
1133 1133 */
1134 1134 ret = KPHYSM_EBUSY;
1135 1135 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1136 1136 mdsp = mdsp->mds_next) {
1137 1137 if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
1138 1138 base, npgs)) {
1139 1139 ret = KPHYSM_EDUP;
1140 1140 break;
1141 1141 }
1142 1142 }
1143 1143 mutex_exit(&mhp->mh_mutex);
/* The insert was refused, so the new spans are still owned here. */
1144 1144 free_delspans(mdsp_new);
1145 1145 return (ret);
1146 1146 }
1147 1147 /*
1148 1148 * At this point the spans in mdsp_new have been inserted into the
1149 1149 * list of spans for this handle and thereby to the global list of
1150 1150 * spans being processed. Each of these spans must now be checked
1151 1151 * for relocatability. As a side-effect segments in the memseg list
1152 1152 * may be split.
1153 1153 *
1154 1154 * Note that mdsp_new can no longer be used as it is now part of
1155 1155 * a larger list. Select elements of this larger list based
1156 1156 * on base and npgs.
1157 1157 */
/*
 * The running totals are reset on every pass because a successful
 * memseg split jumps back here and rescans from the beginning.
 */
1158 1158 restart:
1159 1159 phys_pages = 0;
1160 1160 vm_pages = 0;
1161 1161 ret = KPHYSM_OK;
1162 1162 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1163 1163 mdsp = mdsp->mds_next) {
1164 1164 pgcnt_t pages_checked;
1165 1165
1166 1166 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
1167 1167 continue;
1168 1168 }
1169 1169 p_end = mdsp->mds_base + mdsp->mds_npgs;
1170 1170 /*
1171 1171 * The pages_checked count is a hack. All pages should be
1172 1172 * checked for relocatability. Those not covered by memsegs
1173 1173 * should be tested with arch_kphysm_del_span_ok().
1174 1174 */
1175 1175 pages_checked = 0;
1176 1176 for (seg = memsegs; seg; seg = seg->next) {
1177 1177 pfn_t mseg_start;
1178 1178
1179 1179 if (seg->pages_base >= p_end ||
1180 1180 seg->pages_end <= mdsp->mds_base) {
1181 1181 /* Span and memseg don't overlap. */
1182 1182 continue;
1183 1183 }
1184 1184 mseg_start = memseg_get_start(seg);
1185 1185 /* Check that segment is suitable for delete. */
1186 1186 if (memseg_includes_meta(seg)) {
1187 1187 /*
1188 1188 * Check that this segment is completely
1189 1189 * within the span.
1190 1190 */
1191 1191 if (mseg_start < mdsp->mds_base ||
1192 1192 seg->pages_end > p_end) {
1193 1193 ret = KPHYSM_EBUSY;
1194 1194 break;
1195 1195 }
1196 1196 pages_checked += seg->pages_end - mseg_start;
1197 1197 } else {
1198 1198 /*
1199 1199 * If this segment is larger than the span,
1200 1200 * try to split it. After the split, it
1201 1201 * is necessary to restart.
1202 1202 */
1203 1203 if (seg->pages_base < mdsp->mds_base ||
1204 1204 seg->pages_end > p_end) {
1205 1205 pfn_t abase;
1206 1206 pgcnt_t anpgs;
1207 1207 int s_ret;
1208 1208
1209 1209 /* Split required. */
/* [abase, abase + anpgs) is the part common to span and memseg. */
1210 1210 if (mdsp->mds_base < seg->pages_base)
1211 1211 abase = seg->pages_base;
1212 1212 else
1213 1213 abase = mdsp->mds_base;
1214 1214 if (p_end > seg->pages_end)
1215 1215 anpgs = seg->pages_end - abase;
1216 1216 else
1217 1217 anpgs = p_end - abase;
1218 1218 s_ret = kphysm_split_memseg(abase,
1219 1219 anpgs);
1220 1220 if (s_ret == 0) {
1221 1221 /* Split failed. */
1222 1222 ret = KPHYSM_ERESOURCE;
1223 1223 break;
1224 1224 }
1225 1225 goto restart;
1226 1226 }
1227 1227 pages_checked +=
1228 1228 seg->pages_end - seg->pages_base;
1229 1229 }
1230 1230 /*
1231 1231 * The memseg is wholly within the delete span.
1232 1232 * The individual pages can now be checked.
1233 1233 */
1234 1234 /* Cage test. */
1235 1235 for (pp = seg->pages; pp < seg->epages; pp++) {
1236 1236 if (PP_ISNORELOC(pp)) {
1237 1237 ret = KPHYSM_ENONRELOC;
1238 1238 break;
1239 1239 }
1240 1240 }
1241 1241 if (ret != KPHYSM_OK) {
1242 1242 break;
1243 1243 }
1244 1244 phys_pages += (seg->pages_end - mseg_start);
1245 1245 vm_pages += MSEG_NPAGES(seg);
1246 1246 }
1247 1247 if (ret != KPHYSM_OK)
1248 1248 break;
/* Pages of the span not accounted for by any memseg make it unusable. */
1249 1249 if (pages_checked != mdsp->mds_npgs) {
1250 1250 ret = KPHYSM_ENONRELOC;
1251 1251 break;
1252 1252 }
1253 1253 }
1254 1254
1255 1255 if (ret == KPHYSM_OK) {
1256 1256 mhp->mh_phys_pages += phys_pages;
1257 1257 mhp->mh_vm_pages += vm_pages;
1258 1258 } else {
1259 1259 /*
1260 1260 * Keep holding the mh_mutex to prevent it going away.
1261 1261 */
1262 1262 delspan_remove(&mhp->mh_transit, base, npgs);
1263 1263 }
1264 1264 mutex_exit(&mhp->mh_mutex);
1265 1265 return (ret);
1266 1266 }
1267 1267
/*
 * Report on the deletability of the span [base, base + npgs) without
 * touching any delete handle or transit list.
 *
 * The span is intersected with installed memory by span_to_install()
 * and the resulting pieces are walked page by page.  Results in *mqp:
 *   phys_pages            - sum of the sizes of the installed pieces.
 *   managed               - pages examined through a memseg's page_t
 *                           array.
 *   nonrelocatable        - pages outside any memseg rejected by
 *                           arch_kphysm_del_span_ok(), plus page_t pages
 *                           with PP_ISNORELOC set.
 *   first_nonrelocatable  - pfn of the first such page encountered.
 *   last_nonrelocatable   - pfn of the most recent such page encountered.
 * All fields are zeroed first.  The temporary span list is freed before
 * returning.  Always returns KPHYSM_OK.
 */
1268 1268 int
1269 1269 kphysm_del_span_query(
1270 1270 pfn_t base,
1271 1271 pgcnt_t npgs,
1272 1272 memquery_t *mqp)
1273 1273 {
1274 1274 struct memdelspan *mdsp;
1275 1275 struct memdelspan *mdsp_new;
1276 1276 int done_first_nonreloc;
1277 1277
1278 1278 mqp->phys_pages = 0;
1279 1279 mqp->managed = 0;
1280 1280 mqp->nonrelocatable = 0;
1281 1281 mqp->first_nonrelocatable = 0;
1282 1282 mqp->last_nonrelocatable = 0;
1283 1283
1284 1284 mdsp_new = span_to_install(base, npgs);
1285 1285 /*
1286 1286 * It is OK to proceed here if mdsp_new == NULL.
1287 1287 */
1288 1288 done_first_nonreloc = 0;
1289 1289 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
1290 1290 pfn_t sbase;
1291 1291 pgcnt_t snpgs;
1292 1292
1293 1293 mqp->phys_pages += mdsp->mds_npgs;
/* sbase/snpgs track the part of this piece still to be examined. */
1294 1294 sbase = mdsp->mds_base;
1295 1295 snpgs = mdsp->mds_npgs;
1296 1296 while (snpgs != 0) {
1297 1297 struct memseg *lseg, *seg;
1298 1298 pfn_t p_end;
1299 1299 page_t *pp;
1300 1300 pfn_t mseg_start;
1301 1301
1302 1302 p_end = sbase + snpgs;
1303 1303 /*
1304 1304 * Find the lowest addressed memseg that starts
1305 1305 * after sbase and account for it.
1306 1306 * This is to catch dynamic memsegs whose start
1307 1307 * is hidden.
1308 1308 */
1309 1309 seg = NULL;
1310 1310 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
1311 1311 if ((lseg->pages_base >= sbase) ||
1312 1312 (lseg->pages_base < p_end &&
1313 1313 lseg->pages_end > sbase)) {
1314 1314 if (seg == NULL ||
1315 1315 seg->pages_base > lseg->pages_base)
1316 1316 seg = lseg;
1317 1317 }
1318 1318 }
1319 1319 if (seg != NULL) {
1320 1320 mseg_start = memseg_get_start(seg);
1321 1321 /*
1322 1322 * Now have the full extent of the memseg so
1323 1323 * do the range check.
1324 1324 */
1325 1325 if (mseg_start >= p_end ||
1326 1326 seg->pages_end <= sbase) {
1327 1327 /* Span does not overlap memseg. */
1328 1328 seg = NULL;
1329 1329 }
1330 1330 }
1331 1331 /*
1332 1332 * Account for gap either before the segment if
1333 1333 * there is one or to the end of the span.
1334 1334 */
/* mseg_start is only read when seg != NULL, i.e. after it was set. */
1335 1335 if (seg == NULL || mseg_start > sbase) {
1336 1336 pfn_t a_end;
1337 1337
1338 1338 a_end = (seg == NULL) ? p_end : mseg_start;
1339 1339 /*
1340 1340 * Check with arch layer for relocatability.
1341 1341 */
1342 1342 if (arch_kphysm_del_span_ok(sbase,
1343 1343 (a_end - sbase))) {
1344 1344 /*
1345 1345 * No non-relocatable pages in this
1346 1346 * area, avoid the fine-grained
1347 1347 * test.
1348 1348 */
1349 1349 snpgs -= (a_end - sbase);
1350 1350 sbase = a_end;
1351 1351 }
/* Page-by-page fallback; skipped when the bulk test advanced sbase. */
1352 1352 while (sbase < a_end) {
1353 1353 if (!arch_kphysm_del_span_ok(sbase,
1354 1354 1)) {
1355 1355 mqp->nonrelocatable++;
1356 1356 if (!done_first_nonreloc) {
1357 1357 mqp->
1358 1358 first_nonrelocatable
1359 1359 = sbase;
1360 1360 done_first_nonreloc = 1;
1361 1361 }
1362 1362 mqp->last_nonrelocatable =
1363 1363 sbase;
1364 1364 }
1365 1365 sbase++;
1366 1366 snpgs--;
1367 1367 }
1368 1368 }
1369 1369 if (seg != NULL) {
1370 1370 ASSERT(mseg_start <= sbase);
1371 1371 if (seg->pages_base != mseg_start &&
1372 1372 seg->pages_base > sbase) {
1373 1373 pgcnt_t skip_pgs;
1374 1374
1375 1375 /*
1376 1376 * Skip the page_t area of a
1377 1377 * dynamic memseg.
1378 1378 */
1379 1379 skip_pgs = seg->pages_base - sbase;
1380 1380 if (snpgs <= skip_pgs) {
1381 1381 sbase += snpgs;
1382 1382 snpgs = 0;
1383 1383 continue;
1384 1384 }
1385 1385 snpgs -= skip_pgs;
1386 1386 sbase += skip_pgs;
1387 1387 }
1388 1388 ASSERT(snpgs != 0);
1389 1389 ASSERT(seg->pages_base <= sbase);
1390 1390 /*
1391 1391 * The individual pages can now be checked.
1392 1392 */
1393 1393 for (pp = seg->pages +
1394 1394 (sbase - seg->pages_base);
1395 1395 snpgs != 0 && pp < seg->epages; pp++) {
1396 1396 mqp->managed++;
1397 1397 if (PP_ISNORELOC(pp)) {
1398 1398 mqp->nonrelocatable++;
1399 1399 if (!done_first_nonreloc) {
1400 1400 mqp->
1401 1401 first_nonrelocatable
1402 1402 = sbase;
1403 1403 done_first_nonreloc = 1;
1404 1404 }
1405 1405 mqp->last_nonrelocatable =
1406 1406 sbase;
1407 1407 }
1408 1408 sbase++;
1409 1409 snpgs--;
1410 1410 }
1411 1411 }
1412 1412 }
1413 1413 }
1414 1414
1415 1415 free_delspans(mdsp_new);
1416 1416
1417 1417 return (KPHYSM_OK);
1418 1418 }
1419 1419
1420 1420 /*
1421 1421 * This release function can be called at any stage as follows:
1422 1422 * _gethandle only called
1423 1423 * _span(s) only called
1424 1424 * _start called but failed
1425 1425 * delete thread exited
1426 1426 */
1427 1427 int
1428 1428 kphysm_del_release(memhandle_t handle)
1429 1429 {
1430 1430 struct mem_handle *mhp;
1431 1431
1432 1432 mhp = kphysm_lookup_mem_handle(handle);
1433 1433 if (mhp == NULL) {
1434 1434 return (KPHYSM_EHANDLE);
1435 1435 }
1436 1436 switch (mhp->mh_state) {
1437 1437 case MHND_STARTING:
1438 1438 case MHND_RUNNING:
1439 1439 mutex_exit(&mhp->mh_mutex);
1440 1440 return (KPHYSM_ENOTFINISHED);
1441 1441 case MHND_FREE:
1442 1442 ASSERT(mhp->mh_state != MHND_FREE);
1443 1443 mutex_exit(&mhp->mh_mutex);
1444 1444 return (KPHYSM_EHANDLE);
1445 1445 case MHND_INIT:
1446 1446 break;
1447 1447 case MHND_DONE:
1448 1448 break;
1449 1449 case MHND_RELEASE:
1450 1450 mutex_exit(&mhp->mh_mutex);
1451 1451 return (KPHYSM_ESEQUENCE);
1452 1452 default:
1453 1453 #ifdef DEBUG
1454 1454 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
1455 1455 (void *)mhp, mhp->mh_state);
1456 1456 #endif /* DEBUG */
1457 1457 mutex_exit(&mhp->mh_mutex);
1458 1458 return (KPHYSM_EHANDLE);
1459 1459 }
1460 1460 /*
1461 1461 * Set state so that we can wait if necessary.
1462 1462 * Also this means that we have read/write access to all
1463 1463 * fields except mh_exthandle and mh_state.
1464 1464 */
1465 1465 mhp->mh_state = MHND_RELEASE;
1466 1466 /*
1467 1467 * The mem_handle cannot be de-allocated by any other operation
1468 1468 * now, so no need to hold mh_mutex.
1469 1469 */
1470 1470 mutex_exit(&mhp->mh_mutex);
1471 1471
1472 1472 delspan_remove(&mhp->mh_transit, 0, 0);
1473 1473 mhp->mh_phys_pages = 0;
1474 1474 mhp->mh_vm_pages = 0;
1475 1475 mhp->mh_hold_todo = 0;
1476 1476 mhp->mh_delete_complete = NULL;
1477 1477 mhp->mh_delete_complete_arg = NULL;
1478 1478 mhp->mh_cancel = 0;
1479 1479
1480 1480 mutex_enter(&mhp->mh_mutex);
1481 1481 ASSERT(mhp->mh_state == MHND_RELEASE);
1482 1482 mhp->mh_state = MHND_FREE;
1483 1483
1484 1484 kphysm_free_mem_handle(mhp);
1485 1485
1486 1486 return (KPHYSM_OK);
1487 1487 }
1488 1488
1489 1489 /*
1490 1490 * This cancel function can only be called with the thread running.
1491 1491 */
1492 1492 int
1493 1493 kphysm_del_cancel(memhandle_t handle)
1494 1494 {
1495 1495 struct mem_handle *mhp;
1496 1496
1497 1497 mhp = kphysm_lookup_mem_handle(handle);
1498 1498 if (mhp == NULL) {
1499 1499 return (KPHYSM_EHANDLE);
1500 1500 }
1501 1501 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
1502 1502 mutex_exit(&mhp->mh_mutex);
1503 1503 return (KPHYSM_ENOTRUNNING);
1504 1504 }
1505 1505 /*
1506 1506 * Set the cancel flag and wake the delete thread up.
1507 1507 * The thread may be waiting on I/O, so the effect of the cancel
1508 1508 * may be delayed.
1509 1509 */
1510 1510 if (mhp->mh_cancel == 0) {
1511 1511 mhp->mh_cancel = KPHYSM_ECANCELLED;
1512 1512 cv_signal(&mhp->mh_cv);
1513 1513 }
1514 1514 mutex_exit(&mhp->mh_mutex);
1515 1515 return (KPHYSM_OK);
1516 1516 }
1517 1517
1518 1518 int
1519 1519 kphysm_del_status(
1520 1520 memhandle_t handle,
1521 1521 memdelstat_t *mdstp)
1522 1522 {
1523 1523 struct mem_handle *mhp;
1524 1524
1525 1525 mhp = kphysm_lookup_mem_handle(handle);
1526 1526 if (mhp == NULL) {
1527 1527 return (KPHYSM_EHANDLE);
1528 1528 }
1529 1529 /*
1530 1530 * Calling kphysm_del_status() is allowed before the delete
1531 1531 * is started to allow for status display.
1532 1532 */
1533 1533 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
1534 1534 mhp->mh_state != MHND_RUNNING) {
1535 1535 mutex_exit(&mhp->mh_mutex);
1536 1536 return (KPHYSM_ENOTRUNNING);
1537 1537 }
1538 1538 mdstp->phys_pages = mhp->mh_phys_pages;
1539 1539 mdstp->managed = mhp->mh_vm_pages;
1540 1540 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
1541 1541 mutex_exit(&mhp->mh_mutex);
1542 1542 return (KPHYSM_OK);
1543 1543 }
1544 1544
1545 1545 static int mem_delete_additional_pages = 100;
1546 1546
1547 1547 static int
1548 1548 can_remove_pgs(pgcnt_t npgs)
1549 1549 {
1550 1550 /*
1551 1551 * If all pageable pages were paged out, freemem would
1552 1552 * equal availrmem. There is a minimum requirement for
1553 1553 * availrmem.
1554 1554 */
1555 1555 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
1556 1556 < npgs)
1557 1557 return (0);
1558 1558 /* TODO: check swap space, etc. */
1559 1559 return (1);
1560 1560 }
1561 1561
1562 1562 static int
1563 1563 get_availrmem(pgcnt_t npgs)
1564 1564 {
1565 1565 int ret;
1566 1566
1567 1567 mutex_enter(&freemem_lock);
1568 1568 ret = can_remove_pgs(npgs);
1569 1569 if (ret != 0)
1570 1570 availrmem -= npgs;
1571 1571 mutex_exit(&freemem_lock);
1572 1572 return (ret);
1573 1573 }
1574 1574
1575 1575 static void
1576 1576 put_availrmem(pgcnt_t npgs)
1577 1577 {
1578 1578 mutex_enter(&freemem_lock);
1579 1579 availrmem += npgs;
1580 1580 mutex_exit(&freemem_lock);
1581 1581 }
1582 1582
/*
 * freemem_incr is the upper bound on the number of pages that
 * delthr_get_freemem() tries to take from freemem in one call.
 */
1583 1583 #define FREEMEM_INCR 100
1584 1584 static pgcnt_t freemem_incr = FREEMEM_INCR;
/*
 * DEL_FREE_WAIT_TICKS is hz / DEL_FREE_WAIT_FRAC rounded up.  It is the
 * timeout passed to cv_reltimedwait() in delthr_get_freemem().
 */
1585 1585 #define DEL_FREE_WAIT_FRAC 4
1586 1586 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)
1587 1587
/*
 * DEL_BUSY_WAIT_TICKS is hz / DEL_BUSY_WAIT_FRAC rounded up.
 * NOTE(review): its user is outside this part of the file - confirm usage.
 */
1588 1588 #define DEL_BUSY_WAIT_FRAC 20
1589 1589 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
1590 1590
/* Forward declarations for helpers defined later in the file. */
1591 1591 static void kphysm_del_cleanup(struct mem_handle *);
1592 1592
1593 1593 static void page_delete_collect(page_t *, struct mem_handle *);
1594 1594
1595 1595 static pgcnt_t
1596 1596 delthr_get_freemem(struct mem_handle *mhp)
1597 1597 {
1598 1598 pgcnt_t free_get;
1599 1599 int ret;
1600 1600
1601 1601 ASSERT(MUTEX_HELD(&mhp->mh_mutex));
1602 1602
1603 1603 MDSTAT_INCR(mhp, need_free);
1604 1604 /*
1605 1605 * Get up to freemem_incr pages.
1606 1606 */
1607 1607 free_get = freemem_incr;
1608 1608 if (free_get > mhp->mh_hold_todo)
1609 1609 free_get = mhp->mh_hold_todo;
1610 1610 /*
1611 1611 * Take free_get pages away from freemem,
1612 1612 * waiting if necessary.
1613 1613 */
1614 1614
1615 1615 while (!mhp->mh_cancel) {
1616 1616 mutex_exit(&mhp->mh_mutex);
1617 1617 MDSTAT_INCR(mhp, free_loop);
1618 1618 /*
1619 1619 * Duplicate test from page_create_throttle()
1620 1620 * but don't override with !PG_WAIT.
1621 1621 */
1622 1622 if (freemem < (free_get + throttlefree)) {
1623 1623 MDSTAT_INCR(mhp, free_low);
1624 1624 ret = 0;
1625 1625 } else {
1626 1626 ret = page_create_wait(free_get, 0);
1627 1627 if (ret == 0) {
1628 1628 /* EMPTY */
1629 1629 MDSTAT_INCR(mhp, free_failed);
1630 1630 }
1631 1631 }
1632 1632 if (ret != 0) {
1633 1633 mutex_enter(&mhp->mh_mutex);
1634 1634 return (free_get);
1635 1635 }
1636 1636
1637 1637 /*
1638 1638 * Put pressure on pageout.
1639 1639 */
1640 1640 page_needfree(free_get);
1641 1641 cv_signal(&proc_pageout->p_cv);
1642 1642
1643 1643 mutex_enter(&mhp->mh_mutex);
1644 1644 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
1645 1645 DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
1646 1646 mutex_exit(&mhp->mh_mutex);
1647 1647 page_needfree(-(spgcnt_t)free_get);
1648 1648
1649 1649 mutex_enter(&mhp->mh_mutex);
1650 1650 }
1651 1651 return (0);
1652 1652 }
1653 1653
1654 1654 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */
1655 1655 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100
1656 1656 /*
1657 1657 * This function is run as a helper thread for delete_memory_thread.
1658 1658 * It is needed in order to force kaio cleanup, so that pages used in kaio
1659 1659 * will be unlocked and subsequently relocated by delete_memory_thread.
1660 1660 * The address of the delete_memory_threads's mem_handle is passed in to
1661 1661 * this thread function, and is used to set the mh_aio_cleanup_done member
1662 1662 * prior to calling thread_exit().
1663 1663 */
1664 1664 static void
1665 1665 dr_aio_cleanup_thread(caddr_t amhp)
1666 1666 {
1667 1667 proc_t *procp;
1668 1668 int (*aio_cleanup_dr_delete_memory)(proc_t *);
1669 1669 int cleaned;
1670 1670 int n = 0;
1671 1671 struct mem_handle *mhp;
1672 1672 volatile uint_t *pcancel;
1673 1673
1674 1674 mhp = (struct mem_handle *)amhp;
1675 1675 ASSERT(mhp != NULL);
1676 1676 pcancel = &mhp->mh_dr_aio_cleanup_cancel;
1677 1677 if (modload("sys", "kaio") == -1) {
1678 1678 mhp->mh_aio_cleanup_done = 1;
1679 1679 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
1680 1680 thread_exit();
1681 1681 }
1682 1682 aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
1683 1683 modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
1684 1684 if (aio_cleanup_dr_delete_memory == NULL) {
1685 1685 mhp->mh_aio_cleanup_done = 1;
1686 1686 cmn_err(CE_WARN,
1687 1687 "aio_cleanup_dr_delete_memory not found in kaio");
1688 1688 thread_exit();
1689 1689 }
1690 1690 do {
1691 1691 cleaned = 0;
1692 1692 mutex_enter(&pidlock);
1693 1693 for (procp = practive; (*pcancel == 0) && (procp != NULL);
1694 1694 procp = procp->p_next) {
1695 1695 mutex_enter(&procp->p_lock);
1696 1696 if (procp->p_aio != NULL) {
1697 1697 /* cleanup proc's outstanding kaio */
1698 1698 cleaned +=
1699 1699 (*aio_cleanup_dr_delete_memory)(procp);
1700 1700 }
1701 1701 mutex_exit(&procp->p_lock);
1702 1702 }
1703 1703 mutex_exit(&pidlock);
1704 1704 if ((*pcancel == 0) &&
1705 1705 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
1706 1706 /* delay a bit before retrying all procs again */
1707 1707 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
1708 1708 n = 0;
1709 1709 }
1710 1710 } while (*pcancel == 0);
1711 1711 mhp->mh_aio_cleanup_done = 1;
1712 1712 thread_exit();
1713 1713 }
1714 1714
1715 1715 static void
1716 1716 delete_memory_thread(caddr_t amhp)
1717 1717 {
1718 1718 struct mem_handle *mhp;
1719 1719 struct memdelspan *mdsp;
1720 1720 callb_cpr_t cprinfo;
1721 1721 page_t *pp_targ;
1722 1722 spgcnt_t freemem_left;
1723 1723 void (*del_complete_funcp)(void *, int error);
1724 1724 void *del_complete_arg;
1725 1725 int comp_code;
1726 1726 int ret;
1727 1727 int first_scan;
1728 1728 uint_t szc;
1729 1729 #ifdef MEM_DEL_STATS
1730 1730 uint64_t start_total, ntick_total;
1731 1731 uint64_t start_pgrp, ntick_pgrp;
1732 1732 #endif /* MEM_DEL_STATS */
1733 1733
1734 1734 mhp = (struct mem_handle *)amhp;
1735 1735
1736 1736 #ifdef MEM_DEL_STATS
1737 1737 start_total = ddi_get_lbolt();
1738 1738 #endif /* MEM_DEL_STATS */
1739 1739
1740 1740 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
1741 1741 callb_generic_cpr, "memdel");
1742 1742
1743 1743 mutex_enter(&mhp->mh_mutex);
1744 1744 ASSERT(mhp->mh_state == MHND_STARTING);
1745 1745
1746 1746 mhp->mh_state = MHND_RUNNING;
1747 1747 mhp->mh_thread_id = curthread;
1748 1748
1749 1749 mhp->mh_hold_todo = mhp->mh_vm_pages;
1750 1750 mutex_exit(&mhp->mh_mutex);
1751 1751
1752 1752 /* Allocate the remap pages now, if necessary. */
1753 1753 memseg_remap_init();
1754 1754
1755 1755 /*
1756 1756 * Subtract from availrmem now if possible as availrmem
1757 1757 * may not be available by the end of the delete.
1758 1758 */
1759 1759 if (!get_availrmem(mhp->mh_vm_pages)) {
1760 1760 comp_code = KPHYSM_ENOTVIABLE;
1761 1761 mutex_enter(&mhp->mh_mutex);
1762 1762 goto early_exit;
1763 1763 }
1764 1764
1765 1765 ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
1766 1766
1767 1767 mutex_enter(&mhp->mh_mutex);
1768 1768
1769 1769 if (ret != 0) {
1770 1770 mhp->mh_cancel = KPHYSM_EREFUSED;
1771 1771 goto refused;
1772 1772 }
1773 1773
1774 1774 transit_list_collect(mhp, 1);
1775 1775
1776 1776 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
1777 1777 mdsp = mdsp->mds_next) {
1778 1778 ASSERT(mdsp->mds_bitmap == NULL);
1779 1779 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
1780 1780 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
1781 1781 KM_SLEEP);
1782 1782 }
1783 1783
1784 1784 first_scan = 1;
1785 1785 freemem_left = 0;
1786 1786 /*
1787 1787 * Start dr_aio_cleanup_thread, which periodically iterates
1788 1788 * through the process list and invokes aio cleanup. This
1789 1789 * is needed in order to avoid a deadly embrace between the
1790 1790 * delete_memory_thread (waiting on writer lock for page, with the
1791 1791 * exclusive-wanted bit set), kaio read request threads (waiting for a
1792 1792 * reader lock on the same page that is wanted by the
1793 1793 * delete_memory_thread), and threads waiting for kaio completion
1794 1794 * (blocked on spt_amp->lock).
1795 1795 */
1796 1796 mhp->mh_dr_aio_cleanup_cancel = 0;
1797 1797 mhp->mh_aio_cleanup_done = 0;
1798 1798 (void) thread_create(NULL, 0, dr_aio_cleanup_thread,
1799 1799 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
1800 1800 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
1801 1801 pgcnt_t collected;
1802 1802
1803 1803 MDSTAT_INCR(mhp, nloop);
1804 1804 collected = 0;
1805 1805 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
1806 1806 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
1807 1807 pfn_t pfn, p_end;
1808 1808
1809 1809 p_end = mdsp->mds_base + mdsp->mds_npgs;
1810 1810 for (pfn = mdsp->mds_base; (pfn < p_end) &&
1811 1811 (mhp->mh_cancel == 0); pfn++) {
1812 1812 page_t *pp, *tpp, *tpp_targ;
1813 1813 pgcnt_t bit;
1814 1814 struct vnode *vp;
1815 1815 u_offset_t offset;
1816 1816 int mod, result;
1817 1817 spgcnt_t pgcnt;
1818 1818
1819 1819 bit = pfn - mdsp->mds_base;
1820 1820 if ((mdsp->mds_bitmap[bit / NBPBMW] &
1821 1821 (1 << (bit % NBPBMW))) != 0) {
1822 1822 MDSTAT_INCR(mhp, already_done);
1823 1823 continue;
1824 1824 }
1825 1825 if (freemem_left == 0) {
1826 1826 freemem_left += delthr_get_freemem(mhp);
1827 1827 if (freemem_left == 0)
1828 1828 break;
1829 1829 }
1830 1830
1831 1831 /*
1832 1832 * Release mh_mutex - some of this
1833 1833 * stuff takes some time (eg PUTPAGE).
1834 1834 */
1835 1835
1836 1836 mutex_exit(&mhp->mh_mutex);
1837 1837 MDSTAT_INCR(mhp, ncheck);
1838 1838
1839 1839 pp = page_numtopp_nolock(pfn);
1840 1840 if (pp == NULL) {
1841 1841 /*
1842 1842 * Not covered by a page_t - will
1843 1843 * be dealt with elsewhere.
1844 1844 */
1845 1845 MDSTAT_INCR(mhp, nopaget);
1846 1846 mutex_enter(&mhp->mh_mutex);
1847 1847 mdsp->mds_bitmap[bit / NBPBMW] |=
1848 1848 (1 << (bit % NBPBMW));
1849 1849 continue;
1850 1850 }
1851 1851
1852 1852 if (!page_try_reclaim_lock(pp, SE_EXCL,
1853 1853 SE_EXCL_WANTED | SE_RETIRED)) {
1854 1854 /*
1855 1855 * Page in use elsewhere. Skip it.
1856 1856 */
1857 1857 MDSTAT_INCR(mhp, lockfail);
1858 1858 mutex_enter(&mhp->mh_mutex);
1859 1859 continue;
1860 1860 }
1861 1861 /*
1862 1862 * See if the cage expanded into the delete.
1863 1863 * This can happen as we have to allow the
1864 1864 * cage to expand.
1865 1865 */
1866 1866 if (PP_ISNORELOC(pp)) {
1867 1867 page_unlock(pp);
1868 1868 mutex_enter(&mhp->mh_mutex);
1869 1869 mhp->mh_cancel = KPHYSM_ENONRELOC;
1870 1870 break;
1871 1871 }
1872 1872 if (PP_RETIRED(pp)) {
1873 1873 /*
1874 1874 * Page has been retired and is
1875 1875 * not part of the cage so we
1876 1876 * can now do the accounting for
1877 1877 * it.
1878 1878 */
1879 1879 MDSTAT_INCR(mhp, retired);
1880 1880 mutex_enter(&mhp->mh_mutex);
1881 1881 mdsp->mds_bitmap[bit / NBPBMW]
1882 1882 |= (1 << (bit % NBPBMW));
1883 1883 mdsp->mds_bitmap_retired[bit /
1884 1884 NBPBMW] |=
1885 1885 (1 << (bit % NBPBMW));
1886 1886 mhp->mh_hold_todo--;
1887 1887 continue;
1888 1888 }
1889 1889 ASSERT(freemem_left != 0);
1890 1890 if (PP_ISFREE(pp)) {
1891 1891 /*
1892 1892 * Like page_reclaim() only 'freemem'
1893 1893 * processing is already done.
1894 1894 */
1895 1895 MDSTAT_INCR(mhp, nfree);
1896 1896 free_page_collect:
1897 1897 if (PP_ISAGED(pp)) {
1898 1898 page_list_sub(pp,
1899 1899 PG_FREE_LIST);
1900 1900 } else {
1901 1901 page_list_sub(pp,
1902 1902 PG_CACHE_LIST);
1903 1903 }
1904 1904 PP_CLRFREE(pp);
1905 1905 PP_CLRAGED(pp);
1906 1906 collected++;
1907 1907 mutex_enter(&mhp->mh_mutex);
1908 1908 page_delete_collect(pp, mhp);
1909 1909 mdsp->mds_bitmap[bit / NBPBMW] |=
1910 1910 (1 << (bit % NBPBMW));
1911 1911 freemem_left--;
1912 1912 continue;
1913 1913 }
1914 1914 ASSERT(pp->p_vnode != NULL);
1915 1915 if (first_scan) {
1916 1916 MDSTAT_INCR(mhp, first_notfree);
1917 1917 page_unlock(pp);
1918 1918 mutex_enter(&mhp->mh_mutex);
1919 1919 continue;
1920 1920 }
1921 1921 /*
1922 1922 * Keep stats on pages encountered that
1923 1923 * are marked for retirement.
1924 1924 */
1925 1925 if (PP_TOXIC(pp)) {
1926 1926 MDSTAT_INCR(mhp, toxic);
1927 1927 } else if (PP_PR_REQ(pp)) {
1928 1928 MDSTAT_INCR(mhp, failing);
1929 1929 }
1930 1930 /*
1931 1931 * In certain cases below, special exceptions
1932 1932 * are made for pages that are toxic. This
1933 1933 * is because the current meaning of toxic
1934 1934 * is that an uncorrectable error has been
1935 1935 * previously associated with the page.
1936 1936 */
1937 1937 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1938 1938 if (!PP_TOXIC(pp)) {
1939 1939 /*
1940 1940 * Must relocate locked in
1941 1941 * memory pages.
1942 1942 */
1943 1943 #ifdef MEM_DEL_STATS
1944 1944 start_pgrp = ddi_get_lbolt();
1945 1945 #endif /* MEM_DEL_STATS */
1946 1946 /*
1947 1947 * Lock all constituent pages
1948 1948 * of a large page to ensure
1949 1949 * that p_szc won't change.
1950 1950 */
1951 1951 if (!group_page_trylock(pp,
1952 1952 SE_EXCL)) {
1953 1953 MDSTAT_INCR(mhp,
1954 1954 gptllckfail);
1955 1955 page_unlock(pp);
1956 1956 mutex_enter(
1957 1957 &mhp->mh_mutex);
1958 1958 continue;
1959 1959 }
1960 1960 MDSTAT_INCR(mhp, npplocked);
1961 1961 pp_targ =
1962 1962 page_get_replacement_page(
1963 1963 pp, NULL, 0);
1964 1964 if (pp_targ != NULL) {
1965 1965 #ifdef MEM_DEL_STATS
1966 1966 ntick_pgrp =
1967 1967 (uint64_t)
1968 1968 ddi_get_lbolt() -
1969 1969 start_pgrp;
1970 1970 #endif /* MEM_DEL_STATS */
1971 1971 MDSTAT_PGRP(mhp,
1972 1972 ntick_pgrp);
1973 1973 MDSTAT_INCR(mhp,
1974 1974 nlockreloc);
1975 1975 goto reloc;
1976 1976 }
1977 1977 group_page_unlock(pp);
1978 1978 page_unlock(pp);
1979 1979 #ifdef MEM_DEL_STATS
1980 1980 ntick_pgrp =
1981 1981 (uint64_t)ddi_get_lbolt() -
1982 1982 start_pgrp;
1983 1983 #endif /* MEM_DEL_STATS */
1984 1984 MDSTAT_PGRP(mhp, ntick_pgrp);
1985 1985 MDSTAT_INCR(mhp, nnorepl);
1986 1986 mutex_enter(&mhp->mh_mutex);
1987 1987 continue;
1988 1988 } else {
1989 1989 /*
1990 1990 * Cannot do anything about
1991 1991 * this page because it is
1992 1992 * toxic.
1993 1993 */
1994 1994 MDSTAT_INCR(mhp, npplkdtoxic);
1995 1995 page_unlock(pp);
1996 1996 mutex_enter(&mhp->mh_mutex);
1997 1997 continue;
1998 1998 }
1999 1999 }
2000 2000 /*
2001 2001 * Unload the mappings and check if mod bit
2002 2002 * is set.
2003 2003 */
2004 2004 ASSERT(!PP_ISKAS(pp));
2005 2005 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2006 2006 mod = hat_ismod(pp);
2007 2007
2008 2008 #ifdef MEM_DEL_STATS
2009 2009 start_pgrp = ddi_get_lbolt();
2010 2010 #endif /* MEM_DEL_STATS */
2011 2011 if (mod && !PP_TOXIC(pp)) {
2012 2012 /*
2013 2013 * Lock all constituent pages
2014 2014 * of a large page to ensure
2015 2015 * that p_szc won't change.
2016 2016 */
2017 2017 if (!group_page_trylock(pp, SE_EXCL)) {
2018 2018 MDSTAT_INCR(mhp, gptlmodfail);
2019 2019 page_unlock(pp);
2020 2020 mutex_enter(&mhp->mh_mutex);
2021 2021 continue;
2022 2022 }
2023 2023 pp_targ = page_get_replacement_page(pp,
2024 2024 NULL, 0);
2025 2025 if (pp_targ != NULL) {
2026 2026 MDSTAT_INCR(mhp, nmodreloc);
2027 2027 #ifdef MEM_DEL_STATS
2028 2028 ntick_pgrp =
2029 2029 (uint64_t)ddi_get_lbolt() -
2030 2030 start_pgrp;
2031 2031 #endif /* MEM_DEL_STATS */
2032 2032 MDSTAT_PGRP(mhp, ntick_pgrp);
2033 2033 goto reloc;
2034 2034 }
2035 2035 group_page_unlock(pp);
2036 2036 }
2037 2037
2038 2038 if (!page_try_demote_pages(pp)) {
2039 2039 MDSTAT_INCR(mhp, demotefail);
2040 2040 page_unlock(pp);
2041 2041 #ifdef MEM_DEL_STATS
2042 2042 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2043 2043 start_pgrp;
2044 2044 #endif /* MEM_DEL_STATS */
2045 2045 MDSTAT_PGRP(mhp, ntick_pgrp);
2046 2046 mutex_enter(&mhp->mh_mutex);
2047 2047 continue;
2048 2048 }
2049 2049
2050 2050 /*
2051 2051 * Regular 'page-out'.
2052 2052 */
2053 2053 if (!mod) {
2054 2054 MDSTAT_INCR(mhp, ndestroy);
2055 2055 page_destroy(pp, 1);
2056 2056 /*
2057 2057 * page_destroy was called with
2058 2058 * dontfree. As long as p_lckcnt
2059 2059 * and p_cowcnt are both zero, the
2060 2060 * only additional action of
2061 2061 * page_destroy with !dontfree is to
2062 2062 * call page_free, so we can collect
2063 2063 * the page here.
2064 2064 */
2065 2065 collected++;
2066 2066 #ifdef MEM_DEL_STATS
2067 2067 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2068 2068 start_pgrp;
2069 2069 #endif /* MEM_DEL_STATS */
2070 2070 MDSTAT_PGRP(mhp, ntick_pgrp);
2071 2071 mutex_enter(&mhp->mh_mutex);
2072 2072 page_delete_collect(pp, mhp);
2073 2073 mdsp->mds_bitmap[bit / NBPBMW] |=
2074 2074 (1 << (bit % NBPBMW));
2075 2075 continue;
2076 2076 }
2077 2077 /*
2078 2078 * The page is toxic and the mod bit is
2079 2079 * set, we cannot do anything here to deal
2080 2080 * with it.
2081 2081 */
2082 2082 if (PP_TOXIC(pp)) {
2083 2083 page_unlock(pp);
2084 2084 #ifdef MEM_DEL_STATS
2085 2085 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2086 2086 start_pgrp;
2087 2087 #endif /* MEM_DEL_STATS */
2088 2088 MDSTAT_PGRP(mhp, ntick_pgrp);
2089 2089 MDSTAT_INCR(mhp, modtoxic);
2090 2090 mutex_enter(&mhp->mh_mutex);
2091 2091 continue;
2092 2092 }
2093 2093 MDSTAT_INCR(mhp, nputpage);
2094 2094 vp = pp->p_vnode;
2095 2095 offset = pp->p_offset;
2096 2096 VN_HOLD(vp);
2097 2097 page_unlock(pp);
2098 2098 (void) VOP_PUTPAGE(vp, offset, PAGESIZE,
2099 2099 B_INVAL|B_FORCE, kcred, NULL);
2100 2100 VN_RELE(vp);
2101 2101 #ifdef MEM_DEL_STATS
2102 2102 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2103 2103 start_pgrp;
2104 2104 #endif /* MEM_DEL_STATS */
2105 2105 MDSTAT_PGRP(mhp, ntick_pgrp);
2106 2106 /*
2107 2107 * Try to get the page back immediately
2108 2108 * so that it can be collected.
2109 2109 */
2110 2110 pp = page_numtopp_nolock(pfn);
2111 2111 if (pp == NULL) {
2112 2112 MDSTAT_INCR(mhp, nnoreclaim);
2113 2113 /*
2114 2114 * This should not happen as this
2115 2115 * thread is deleting the page.
2116 2116 * If this code is generalized, this
2117 2117 * becomes a reality.
2118 2118 */
2119 2119 #ifdef DEBUG
2120 2120 cmn_err(CE_WARN,
2121 2121 "delete_memory_thread(0x%p) "
2122 2122 "pfn 0x%lx has no page_t",
2123 2123 (void *)mhp, pfn);
2124 2124 #endif /* DEBUG */
2125 2125 mutex_enter(&mhp->mh_mutex);
2126 2126 continue;
2127 2127 }
2128 2128 if (page_try_reclaim_lock(pp, SE_EXCL,
2129 2129 SE_EXCL_WANTED | SE_RETIRED)) {
2130 2130 if (PP_ISFREE(pp)) {
2131 2131 goto free_page_collect;
2132 2132 }
2133 2133 page_unlock(pp);
2134 2134 }
2135 2135 MDSTAT_INCR(mhp, nnoreclaim);
2136 2136 mutex_enter(&mhp->mh_mutex);
2137 2137 continue;
2138 2138
2139 2139 reloc:
2140 2140 /*
2141 2141 * Got some freemem and a target
2142 2142 * page, so move the data to avoid
2143 2143 * I/O and lock problems.
2144 2144 */
2145 2145 ASSERT(!page_iolock_assert(pp));
2146 2146 MDSTAT_INCR(mhp, nreloc);
2147 2147 /*
2148 2148 * page_relocate() will return pgcnt: the
2149 2149 * number of consecutive pages relocated.
2150 2150 * If it is successful, pp will be a
2151 2151 * linked list of the page structs that
2152 2152 * were relocated. If page_relocate() is
2153 2153 * unsuccessful, pp will be unmodified.
2154 2154 */
2155 2155 #ifdef MEM_DEL_STATS
2156 2156 start_pgrp = ddi_get_lbolt();
2157 2157 #endif /* MEM_DEL_STATS */
2158 2158 result = page_relocate(&pp, &pp_targ, 0, 0,
2159 2159 &pgcnt, NULL);
2160 2160 #ifdef MEM_DEL_STATS
2161 2161 ntick_pgrp = (uint64_t)ddi_get_lbolt() -
2162 2162 start_pgrp;
2163 2163 #endif /* MEM_DEL_STATS */
2164 2164 MDSTAT_PGRP(mhp, ntick_pgrp);
2165 2165 if (result != 0) {
2166 2166 MDSTAT_INCR(mhp, nrelocfail);
2167 2167 /*
2168 2168 * We did not succeed. We need
2169 2169 * to give the pp_targ pages back.
2170 2170 * page_free(pp_targ, 1) without
2171 2171 * the freemem accounting.
2172 2172 */
2173 2173 group_page_unlock(pp);
2174 2174 page_free_replacement_page(pp_targ);
2175 2175 page_unlock(pp);
2176 2176 mutex_enter(&mhp->mh_mutex);
2177 2177 continue;
2178 2178 }
2179 2179
2180 2180 /*
2181 2181 * We will then collect pgcnt pages.
2182 2182 */
2183 2183 ASSERT(pgcnt > 0);
2184 2184 mutex_enter(&mhp->mh_mutex);
2185 2185 /*
2186 2186 * We need to make sure freemem_left is
2187 2187 * large enough.
2188 2188 */
2189 2189 while ((freemem_left < pgcnt) &&
2190 2190 (!mhp->mh_cancel)) {
2191 2191 freemem_left +=
2192 2192 delthr_get_freemem(mhp);
2193 2193 }
2194 2194
2195 2195 /*
2196 2196 * Do not proceed if mh_cancel is set.
2197 2197 */
2198 2198 if (mhp->mh_cancel) {
2199 2199 while (pp_targ != NULL) {
2200 2200 /*
2201 2201 * Unlink and unlock each page.
2202 2202 */
2203 2203 tpp_targ = pp_targ;
2204 2204 page_sub(&pp_targ, tpp_targ);
2205 2205 page_unlock(tpp_targ);
2206 2206 }
2207 2207 /*
2208 2208 * We need to give the pp pages back.
2209 2209 * page_free(pp, 1) without the
2210 2210 * freemem accounting.
2211 2211 */
2212 2212 page_free_replacement_page(pp);
2213 2213 break;
2214 2214 }
2215 2215
2216 2216 /* Now remove pgcnt from freemem_left */
2217 2217 freemem_left -= pgcnt;
2218 2218 ASSERT(freemem_left >= 0);
2219 2219 szc = pp->p_szc;
2220 2220 while (pp != NULL) {
2221 2221 /*
2222 2222 * pp and pp_targ were passed back as
2223 2223 * a linked list of pages.
2224 2224 * Unlink and unlock each page.
2225 2225 */
2226 2226 tpp_targ = pp_targ;
2227 2227 page_sub(&pp_targ, tpp_targ);
2228 2228 page_unlock(tpp_targ);
2229 2229 /*
2230 2230 * The original page is now free
2231 2231 * so remove it from the linked
2232 2232 * list and collect it.
2233 2233 */
2234 2234 tpp = pp;
2235 2235 page_sub(&pp, tpp);
2236 2236 pfn = page_pptonum(tpp);
2237 2237 collected++;
2238 2238 ASSERT(PAGE_EXCL(tpp));
2239 2239 ASSERT(tpp->p_vnode == NULL);
2240 2240 ASSERT(!hat_page_is_mapped(tpp));
2241 2241 ASSERT(tpp->p_szc == szc);
2242 2242 tpp->p_szc = 0;
2243 2243 page_delete_collect(tpp, mhp);
2244 2244 bit = pfn - mdsp->mds_base;
2245 2245 mdsp->mds_bitmap[bit / NBPBMW] |=
2246 2246 (1 << (bit % NBPBMW));
2247 2247 }
2248 2248 ASSERT(pp_targ == NULL);
2249 2249 }
2250 2250 }
2251 2251 first_scan = 0;
2252 2252 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
2253 2253 (collected == 0)) {
2254 2254 /*
2255 2255 * This code is needed as we cannot wait
2256 2256 * for a page to be locked OR the delete to
2257 2257 * be cancelled. Also, we must delay so
2258 2258 * that other threads get a chance to run
2259 2259 * on our cpu, otherwise page locks may be
2260 2260 * held indefinitely by those threads.
2261 2261 */
2262 2262 MDSTAT_INCR(mhp, ndelay);
2263 2263 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2264 2264 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
2265 2265 DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
2266 2266 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2267 2267 }
2268 2268 }
2269 2269 /* stop the dr aio cleanup thread */
2270 2270 mhp->mh_dr_aio_cleanup_cancel = 1;
2271 2271 transit_list_collect(mhp, 0);
2272 2272 if (freemem_left != 0) {
2273 2273 /* Return any surplus. */
2274 2274 page_create_putback(freemem_left);
2275 2275 freemem_left = 0;
2276 2276 }
2277 2277 #ifdef MEM_DEL_STATS
2278 2278 ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
2279 2279 #endif /* MEM_DEL_STATS */
2280 2280 MDSTAT_TOTAL(mhp, ntick_total);
2281 2281 MDSTAT_PRINT(mhp);
2282 2282
2283 2283 /*
2284 2284 * If the memory delete was cancelled, exclusive-wanted bits must
2285 2285 * be cleared. If there are retired pages being deleted, they need
2286 2286 * to be unretired.
2287 2287 */
2288 2288 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2289 2289 mdsp = mdsp->mds_next) {
2290 2290 pfn_t pfn, p_end;
2291 2291
2292 2292 p_end = mdsp->mds_base + mdsp->mds_npgs;
2293 2293 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
2294 2294 page_t *pp;
2295 2295 pgcnt_t bit;
2296 2296
2297 2297 bit = pfn - mdsp->mds_base;
2298 2298 if (mhp->mh_cancel) {
2299 2299 pp = page_numtopp_nolock(pfn);
2300 2300 if (pp != NULL) {
2301 2301 if ((mdsp->mds_bitmap[bit / NBPBMW] &
2302 2302 (1 << (bit % NBPBMW))) == 0) {
2303 2303 page_lock_clr_exclwanted(pp);
2304 2304 }
2305 2305 }
2306 2306 } else {
2307 2307 pp = NULL;
2308 2308 }
2309 2309 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
2310 2310 (1 << (bit % NBPBMW))) != 0) {
2311 2311 /* do we already have pp? */
2312 2312 if (pp == NULL) {
2313 2313 pp = page_numtopp_nolock(pfn);
2314 2314 }
2315 2315 ASSERT(pp != NULL);
2316 2316 ASSERT(PP_RETIRED(pp));
2317 2317 if (mhp->mh_cancel != 0) {
2318 2318 page_unlock(pp);
2319 2319 /*
2320 2320 * To satisfy ASSERT below in
2321 2321 * cancel code.
2322 2322 */
2323 2323 mhp->mh_hold_todo++;
2324 2324 } else {
2325 2325 (void) page_unretire_pp(pp,
2326 2326 PR_UNR_CLEAN);
2327 2327 }
2328 2328 }
2329 2329 }
2330 2330 }
2331 2331 /*
2332 2332 * Free retired page bitmap and collected page bitmap
2333 2333 */
2334 2334 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2335 2335 mdsp = mdsp->mds_next) {
2336 2336 ASSERT(mdsp->mds_bitmap_retired != NULL);
2337 2337 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
2338 2338 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */
2339 2339 ASSERT(mdsp->mds_bitmap != NULL);
2340 2340 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
2341 2341 mdsp->mds_bitmap = NULL; /* Paranoia. */
2342 2342 }
2343 2343
2344 2344 /* wait for our dr aio cancel thread to exit */
2345 2345 while (!(mhp->mh_aio_cleanup_done)) {
2346 2346 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2347 2347 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
2348 2348 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
2349 2349 }
2350 2350 refused:
2351 2351 if (mhp->mh_cancel != 0) {
2352 2352 page_t *pp;
2353 2353
2354 2354 comp_code = mhp->mh_cancel;
2355 2355 /*
2356 2356 * Go through list of deleted pages (mh_deleted) freeing
2357 2357 * them.
2358 2358 */
2359 2359 while ((pp = mhp->mh_deleted) != NULL) {
2360 2360 mhp->mh_deleted = pp->p_next;
2361 2361 mhp->mh_hold_todo++;
2362 2362 mutex_exit(&mhp->mh_mutex);
2363 2363 /* Restore p_next. */
2364 2364 pp->p_next = pp->p_prev;
2365 2365 if (PP_ISFREE(pp)) {
2366 2366 cmn_err(CE_PANIC,
2367 2367 "page %p is free",
2368 2368 (void *)pp);
2369 2369 }
2370 2370 page_free(pp, 1);
2371 2371 mutex_enter(&mhp->mh_mutex);
2372 2372 }
2373 2373 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
2374 2374
2375 2375 mutex_exit(&mhp->mh_mutex);
2376 2376 put_availrmem(mhp->mh_vm_pages);
2377 2377 mutex_enter(&mhp->mh_mutex);
2378 2378
2379 2379 goto t_exit;
2380 2380 }
2381 2381
2382 2382 /*
2383 2383 * All the pages are no longer in use and are exclusively locked.
2384 2384 */
2385 2385
2386 2386 mhp->mh_deleted = NULL;
2387 2387
2388 2388 kphysm_del_cleanup(mhp);
2389 2389
2390 2390 /*
2391 2391 * mem_node_del_range needs to be after kphysm_del_cleanup so
2392 2392 * that the mem_node_config[] will remain intact for the cleanup.
2393 2393 */
2394 2394 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2395 2395 mdsp = mdsp->mds_next) {
2396 2396 mem_node_del_range(mdsp->mds_base,
2397 2397 mdsp->mds_base + mdsp->mds_npgs - 1);
2398 2398 }
2399 2399 /* cleanup the page counters */
2400 2400 page_ctrs_cleanup();
2401 2401
2402 2402 comp_code = KPHYSM_OK;
2403 2403
2404 2404 t_exit:
2405 2405 mutex_exit(&mhp->mh_mutex);
2406 2406 kphysm_setup_post_del(mhp->mh_vm_pages,
2407 2407 (comp_code == KPHYSM_OK) ? 0 : 1);
2408 2408 mutex_enter(&mhp->mh_mutex);
2409 2409
2410 2410 early_exit:
2411 2411 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
2412 2412 mhp->mh_state = MHND_DONE;
2413 2413 del_complete_funcp = mhp->mh_delete_complete;
2414 2414 del_complete_arg = mhp->mh_delete_complete_arg;
2415 2415 CALLB_CPR_EXIT(&cprinfo);
2416 2416 (*del_complete_funcp)(del_complete_arg, comp_code);
2417 2417 thread_exit();
2418 2418 /*NOTREACHED*/
2419 2419 }
2420 2420
2421 2421 /*
2422 2422 * Start the delete of the memory from the system.
2423 2423 */
2424 2424 int
2425 2425 kphysm_del_start(
2426 2426 memhandle_t handle,
2427 2427 void (*complete)(void *, int),
2428 2428 void *complete_arg)
2429 2429 {
2430 2430 struct mem_handle *mhp;
2431 2431
2432 2432 mhp = kphysm_lookup_mem_handle(handle);
2433 2433 if (mhp == NULL) {
2434 2434 return (KPHYSM_EHANDLE);
2435 2435 }
2436 2436 switch (mhp->mh_state) {
2437 2437 case MHND_FREE:
2438 2438 ASSERT(mhp->mh_state != MHND_FREE);
2439 2439 mutex_exit(&mhp->mh_mutex);
2440 2440 return (KPHYSM_EHANDLE);
2441 2441 case MHND_INIT:
2442 2442 break;
2443 2443 case MHND_STARTING:
2444 2444 case MHND_RUNNING:
2445 2445 mutex_exit(&mhp->mh_mutex);
2446 2446 return (KPHYSM_ESEQUENCE);
2447 2447 case MHND_DONE:
2448 2448 mutex_exit(&mhp->mh_mutex);
2449 2449 return (KPHYSM_ESEQUENCE);
2450 2450 case MHND_RELEASE:
2451 2451 mutex_exit(&mhp->mh_mutex);
2452 2452 return (KPHYSM_ESEQUENCE);
2453 2453 default:
2454 2454 #ifdef DEBUG
2455 2455 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
2456 2456 (void *)mhp, mhp->mh_state);
2457 2457 #endif /* DEBUG */
2458 2458 mutex_exit(&mhp->mh_mutex);
2459 2459 return (KPHYSM_EHANDLE);
2460 2460 }
2461 2461
2462 2462 if (mhp->mh_transit.trl_spans == NULL) {
2463 2463 mutex_exit(&mhp->mh_mutex);
2464 2464 return (KPHYSM_ENOWORK);
2465 2465 }
2466 2466
2467 2467 ASSERT(complete != NULL);
2468 2468 mhp->mh_delete_complete = complete;
2469 2469 mhp->mh_delete_complete_arg = complete_arg;
2470 2470 mhp->mh_state = MHND_STARTING;
2471 2471 /*
2472 2472 * Release the mutex in case thread_create sleeps.
2473 2473 */
2474 2474 mutex_exit(&mhp->mh_mutex);
2475 2475
2476 2476 /*
2477 2477 * The "obvious" process for this thread is pageout (proc_pageout)
2478 2478 * but this gives the thread too much power over freemem
2479 2479 * which results in freemem starvation.
2480 2480 */
2481 2481 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
2482 2482 TS_RUN, maxclsyspri - 1);
2483 2483
2484 2484 return (KPHYSM_OK);
2485 2485 }
2486 2486
2487 2487 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */
2488 2488 static caddr_t pp_dummy; /* Base VA of the dummy page_t area; non-NULL once memseg_remap_init() has completed. */
2489 2489 static pgcnt_t pp_dummy_npages; /* Number of pages in the dummy area. */
2490 2490 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */
2491 2491
2492 2492 static void
2493 2493 memseg_remap_init_pages(page_t *pages, page_t *epages)
2494 2494 {
2495 2495 page_t *pp;
2496 2496
2497 2497 for (pp = pages; pp < epages; pp++) {
2498 2498 pp->p_pagenum = PFN_INVALID; /* XXXX */
2499 2499 pp->p_offset = (u_offset_t)-1;
2500 2500 page_iolock_init(pp);
2501 2501 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
2502 2502 continue;
2503 2503 page_lock_delete(pp);
2504 2504 }
2505 2505 }
2506 2506
2507 2507 void
2508 2508 memseg_remap_init()
2509 2509 {
2510 2510 mutex_enter(&pp_dummy_lock);
2511 2511 if (pp_dummy == NULL) {
2512 2512 uint_t dpages;
2513 2513 int i;
2514 2514
2515 2515 /*
2516 2516 * dpages starts off as the size of the structure and
2517 2517 * ends up as the minimum number of pages that will
2518 2518 * hold a whole number of page_t structures.
2519 2519 */
2520 2520 dpages = sizeof (page_t);
2521 2521 ASSERT(dpages != 0);
2522 2522 ASSERT(dpages <= MMU_PAGESIZE);
2523 2523
2524 2524 while ((dpages & 1) == 0)
2525 2525 dpages >>= 1;
2526 2526
2527 2527 pp_dummy_npages = dpages;
2528 2528 /*
2529 2529 * Allocate pp_dummy pages directly from static_arena,
2530 2530 * since these are whole page allocations and are
2531 2531 * referenced by physical address. This also has the
2532 2532 * nice fringe benefit of hiding the memory from
2533 2533 * ::findleaks since it doesn't deal well with allocated
2534 2534 * kernel heap memory that doesn't have any mappings.
2535 2535 */
2536 2536 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
2537 2537 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
2538 2538 bzero(pp_dummy, ptob(pp_dummy_npages));
2539 2539 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
2540 2540 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
2541 2541 pp_dummy_npages, KM_SLEEP);
2542 2542 for (i = 0; i < pp_dummy_npages; i++) {
2543 2543 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
2544 2544 &pp_dummy[MMU_PAGESIZE * i]);
2545 2545 ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
2546 2546 }
2547 2547 /*
2548 2548 * Initialize the page_t's to a known 'deleted' state
2549 2549 * that matches the state of deleted pages.
2550 2550 */
2551 2551 memseg_remap_init_pages((page_t *)pp_dummy,
2552 2552 (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
2553 2553 /* Remove kmem mappings for the pages for safety. */
2554 2554 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
2555 2555 HAT_UNLOAD_UNLOCK);
2556 2556 /* Leave pp_dummy pointer set as flag that init is done. */
2557 2557 }
2558 2558 mutex_exit(&pp_dummy_lock);
2559 2559 }
2560 2560
2561 2561 /*
2562 2562 * Remap a page-aglined range of page_t's to dummy pages.
2563 2563 */
2564 2564 void
2565 2565 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
2566 2566 {
2567 2567 int phase;
2568 2568
2569 2569 ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));
2570 2570
2571 2571 /*
2572 2572 * We may start remapping at a non-zero page offset
2573 2573 * within the dummy pages since the low/high ends
2574 2574 * of the outgoing pp's could be shared by other
2575 2575 * memsegs (see memseg_remap_meta).
2576 2576 */
2577 2577 phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
2578 2578 /*CONSTCOND*/
2579 2579 ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
2580 2580
2581 2581 while (metapgs != 0) {
2582 2582 pgcnt_t n;
2583 2583 int i, j;
2584 2584
2585 2585 n = pp_dummy_npages;
2586 2586 if (n > metapgs)
2587 2587 n = metapgs;
2588 2588 for (i = 0; i < n; i++) {
2589 2589 j = (i + phase) % pp_dummy_npages;
2590 2590 hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
2591 2591 PROT_READ,
2592 2592 HAT_LOAD | HAT_LOAD_NOCONSIST |
2593 2593 HAT_LOAD_REMAP);
2594 2594 va += ptob(1);
2595 2595 }
2596 2596 metapgs -= n;
2597 2597 }
2598 2598 }
2599 2599
2600 2600 static void
2601 2601 memseg_remap_to_dummy(struct memseg *seg)
2602 2602 {
2603 2603 caddr_t pp;
2604 2604 pgcnt_t metapgs;
2605 2605
2606 2606 ASSERT(memseg_is_dynamic(seg));
2607 2607 ASSERT(pp_dummy != NULL);
2608 2608
2609 2609
2610 2610 if (!memseg_includes_meta(seg)) {
2611 2611 memseg_remap_meta(seg);
2612 2612 return;
2613 2613 }
2614 2614
2615 2615 pp = (caddr_t)seg->pages;
2616 2616 metapgs = seg->pages_base - memseg_get_start(seg);
2617 2617 ASSERT(metapgs != 0);
2618 2618
2619 2619 seg->pages_end = seg->pages_base;
2620 2620
2621 2621 remap_to_dummy(pp, metapgs);
2622 2622 }
2623 2623
2624 2624 /*
2625 2625 * Transition all the deleted pages to the deleted state so that
2626 2626 * page_lock will not wait. The page_lock_delete call will
2627 2627 * also wake up any waiters.
2628 2628 */
2629 2629 static void
2630 2630 memseg_lock_delete_all(struct memseg *seg)
2631 2631 {
2632 2632 page_t *pp;
2633 2633
2634 2634 for (pp = seg->pages; pp < seg->epages; pp++) {
2635 2635 pp->p_pagenum = PFN_INVALID; /* XXXX */
2636 2636 page_lock_delete(pp);
2637 2637 }
2638 2638 }
2639 2639
2640 2640 static void
2641 2641 kphysm_del_cleanup(struct mem_handle *mhp)
2642 2642 {
2643 2643 struct memdelspan *mdsp;
2644 2644 struct memseg *seg;
2645 2645 struct memseg **segpp;
2646 2646 struct memseg *seglist;
2647 2647 pfn_t p_end;
2648 2648 uint64_t avmem;
2649 2649 pgcnt_t avpgs;
2650 2650 pgcnt_t npgs;
2651 2651
2652 2652 avpgs = mhp->mh_vm_pages;
2653 2653
2654 2654 memsegs_lock(1);
2655 2655
2656 2656 /*
2657 2657 * remove from main segment list.
2658 2658 */
2659 2659 npgs = 0;
2660 2660 seglist = NULL;
2661 2661 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
2662 2662 mdsp = mdsp->mds_next) {
2663 2663 p_end = mdsp->mds_base + mdsp->mds_npgs;
2664 2664 for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
2665 2665 if (seg->pages_base >= p_end ||
2666 2666 seg->pages_end <= mdsp->mds_base) {
2667 2667 /* Span and memseg don't overlap. */
2668 2668 segpp = &((*segpp)->next);
2669 2669 continue;
2670 2670 }
2671 2671 ASSERT(seg->pages_base >= mdsp->mds_base);
2672 2672 ASSERT(seg->pages_end <= p_end);
2673 2673
2674 2674 PLCNT_MODIFY_MAX(seg->pages_base,
2675 2675 seg->pages_base - seg->pages_end);
2676 2676
2677 2677 /* Hide the memseg from future scans. */
2678 2678 hat_kpm_delmem_mseg_update(seg, segpp);
2679 2679 *segpp = seg->next;
2680 2680 membar_producer(); /* TODO: Needed? */
2681 2681 npgs += MSEG_NPAGES(seg);
2682 2682
2683 2683 /*
2684 2684 * Leave the deleted segment's next pointer intact
2685 2685 * in case a memsegs scanning loop is walking this
2686 2686 * segment concurrently.
2687 2687 */
2688 2688 seg->lnext = seglist;
2689 2689 seglist = seg;
2690 2690 }
2691 2691 }
2692 2692
2693 2693 build_pfn_hash();
2694 2694
2695 2695 ASSERT(npgs < total_pages);
2696 2696 total_pages -= npgs;
2697 2697
2698 2698 /*
2699 2699 * Recalculate the paging parameters now total_pages has changed.
2700 2700 * This will also cause the clock hands to be reset before next use.
2701 2701 */
2702 2702 setupclock(1);
2703 2703
2704 2704 memsegs_unlock(1);
2705 2705
2706 2706 mutex_exit(&mhp->mh_mutex);
2707 2707
2708 2708 while ((seg = seglist) != NULL) {
2709 2709 pfn_t mseg_start;
2710 2710 pfn_t mseg_base, mseg_end;
2711 2711 pgcnt_t mseg_npgs;
2712 2712 int mlret;
2713 2713
2714 2714 seglist = seg->lnext;
2715 2715
2716 2716 /*
2717 2717 * Put the page_t's into the deleted state to stop
2718 2718 * cv_wait()s on the pages. When we remap, the dummy
2719 2719 * page_t's will be in the same state.
2720 2720 */
2721 2721 memseg_lock_delete_all(seg);
2722 2722 /*
2723 2723 * Collect up information based on pages_base and pages_end
2724 2724 * early so that we can flag early that the memseg has been
2725 2725 * deleted by setting pages_end == pages_base.
2726 2726 */
2727 2727 mseg_base = seg->pages_base;
2728 2728 mseg_end = seg->pages_end;
2729 2729 mseg_npgs = MSEG_NPAGES(seg);
2730 2730 mseg_start = memseg_get_start(seg);
2731 2731
2732 2732 if (memseg_is_dynamic(seg)) {
2733 2733 /* Remap the meta data to our special dummy area. */
2734 2734 memseg_remap_to_dummy(seg);
2735 2735
2736 2736 mutex_enter(&memseg_lists_lock);
2737 2737 seg->lnext = memseg_va_avail;
2738 2738 memseg_va_avail = seg;
2739 2739 mutex_exit(&memseg_lists_lock);
2740 2740 } else {
2741 2741 /*
2742 2742 * For memory whose page_ts were allocated
2743 2743 * at boot, we need to find a new use for
2744 2744 * the page_t memory.
2745 2745 * For the moment, just leak it.
2746 2746 * (It is held in the memseg_delete_junk list.)
2747 2747 */
2748 2748 seg->pages_end = seg->pages_base;
2749 2749
2750 2750 mutex_enter(&memseg_lists_lock);
2751 2751 seg->lnext = memseg_delete_junk;
2752 2752 memseg_delete_junk = seg;
2753 2753 mutex_exit(&memseg_lists_lock);
2754 2754 }
2755 2755
2756 2756 /* Must not use seg now as it could be re-used. */
2757 2757
2758 2758 memlist_write_lock();
2759 2759
2760 2760 mlret = memlist_delete_span(
2761 2761 (uint64_t)(mseg_base) << PAGESHIFT,
2762 2762 (uint64_t)(mseg_npgs) << PAGESHIFT,
2763 2763 &phys_avail);
2764 2764 ASSERT(mlret == MEML_SPANOP_OK);
2765 2765
2766 2766 mlret = memlist_delete_span(
2767 2767 (uint64_t)(mseg_start) << PAGESHIFT,
2768 2768 (uint64_t)(mseg_end - mseg_start) <<
2769 2769 PAGESHIFT,
2770 2770 &phys_install);
2771 2771 ASSERT(mlret == MEML_SPANOP_OK);
2772 2772 phys_install_has_changed();
2773 2773
2774 2774 memlist_write_unlock();
2775 2775 }
2776 2776
2777 2777 memlist_read_lock();
2778 2778 installed_top_size(phys_install, &physmax, &physinstalled);
2779 2779 memlist_read_unlock();
2780 2780
2781 2781 mutex_enter(&freemem_lock);
2782 2782 maxmem -= avpgs;
2783 2783 physmem -= avpgs;
2784 2784 /* availrmem is adjusted during the delete. */
2785 2785 availrmem_initial -= avpgs;
2786 2786
2787 2787 mutex_exit(&freemem_lock);
2788 2788
2789 2789 dump_resize();
2790 2790
2791 2791 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
2792 2792 "(0x%" PRIx64 ")\n",
2793 2793 physinstalled << (PAGESHIFT - 10),
2794 2794 (uint64_t)physinstalled << PAGESHIFT);
2795 2795
2796 2796 avmem = (uint64_t)freemem << PAGESHIFT;
2797 2797 cmn_err(CE_CONT, "?kphysm_delete: "
2798 2798 "avail mem = %" PRId64 "\n", avmem);
2799 2799
2800 2800 /*
2801 2801 * Update lgroup generation number on single lgroup systems
2802 2802 */
2803 2803 if (nlgrps == 1)
2804 2804 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
2805 2805
2806 2806 /* Successfully deleted system memory */
2807 2807 mutex_enter(&mhp->mh_mutex);
2808 2808 }
2809 2809
2810 2810 static uint_t mdel_nullvp_waiter; /* Bumped by page_delete_collect() each time a page with no vnode has waiters on its p_cv. */
2811 2811
2812 2812 static void
2813 2813 page_delete_collect(
2814 2814 page_t *pp,
2815 2815 struct mem_handle *mhp)
2816 2816 {
2817 2817 if (pp->p_vnode) {
2818 2818 page_hashout(pp, (kmutex_t *)NULL);
2819 2819 /* do not do PP_SETAGED(pp); */
2820 2820 } else {
2821 2821 kmutex_t *sep;
2822 2822
2823 2823 sep = page_se_mutex(pp);
2824 2824 mutex_enter(sep);
2825 2825 if (CV_HAS_WAITERS(&pp->p_cv)) {
2826 2826 mdel_nullvp_waiter++;
2827 2827 cv_broadcast(&pp->p_cv);
2828 2828 }
2829 2829 mutex_exit(sep);
2830 2830 }
2831 2831 ASSERT(pp->p_next == pp->p_prev);
2832 2832 ASSERT(pp->p_next == NULL || pp->p_next == pp);
2833 2833 pp->p_next = mhp->mh_deleted;
2834 2834 mhp->mh_deleted = pp;
2835 2835 ASSERT(mhp->mh_hold_todo != 0);
2836 2836 mhp->mh_hold_todo--;
2837 2837 }
2838 2838
2839 2839 static void
2840 2840 transit_list_collect(struct mem_handle *mhp, int v)
2841 2841 {
2842 2842 struct transit_list_head *trh;
2843 2843
2844 2844 trh = &transit_list_head;
2845 2845 mutex_enter(&trh->trh_lock);
2846 2846 mhp->mh_transit.trl_collect = v;
2847 2847 mutex_exit(&trh->trh_lock);
2848 2848 }
2849 2849
2850 2850 static void
2851 2851 transit_list_insert(struct transit_list *tlp)
2852 2852 {
2853 2853 struct transit_list_head *trh;
2854 2854
2855 2855 trh = &transit_list_head;
2856 2856 ASSERT(MUTEX_HELD(&trh->trh_lock));
2857 2857 tlp->trl_next = trh->trh_head;
2858 2858 trh->trh_head = tlp;
2859 2859 }
2860 2860
2861 2861 static void
2862 2862 transit_list_remove(struct transit_list *tlp)
2863 2863 {
2864 2864 struct transit_list_head *trh;
2865 2865 struct transit_list **tlpp;
2866 2866
2867 2867 trh = &transit_list_head;
2868 2868 tlpp = &trh->trh_head;
2869 2869 ASSERT(MUTEX_HELD(&trh->trh_lock));
2870 2870 while (*tlpp != NULL && *tlpp != tlp)
2871 2871 tlpp = &(*tlpp)->trl_next;
2872 2872 ASSERT(*tlpp != NULL);
2873 2873 if (*tlpp == tlp)
2874 2874 *tlpp = tlp->trl_next;
2875 2875 tlp->trl_next = NULL;
2876 2876 }
2877 2877
2878 2878 static struct transit_list *
2879 2879 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
2880 2880 {
2881 2881 struct transit_list *tlp;
2882 2882
2883 2883 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
2884 2884 struct memdelspan *mdsp;
2885 2885
2886 2886 for (mdsp = tlp->trl_spans; mdsp != NULL;
2887 2887 mdsp = mdsp->mds_next) {
2888 2888 if (pfnum >= mdsp->mds_base &&
2889 2889 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
2890 2890 return (tlp);
2891 2891 }
2892 2892 }
2893 2893 }
2894 2894 return (NULL);
2895 2895 }
2896 2896
2897 2897 int
2898 2898 pfn_is_being_deleted(pfn_t pfnum)
2899 2899 {
2900 2900 struct transit_list_head *trh;
2901 2901 struct transit_list *tlp;
2902 2902 int ret;
2903 2903
2904 2904 trh = &transit_list_head;
2905 2905 if (trh->trh_head == NULL)
2906 2906 return (0);
2907 2907
2908 2908 mutex_enter(&trh->trh_lock);
2909 2909 tlp = pfnum_to_transit_list(trh, pfnum);
2910 2910 ret = (tlp != NULL && tlp->trl_collect);
2911 2911 mutex_exit(&trh->trh_lock);
2912 2912
2913 2913 return (ret);
2914 2914 }
2915 2915
#ifdef MEM_DEL_STATS
extern int hz;

/*
 * Print the delete statistics gathered in mhp->mh_delstat.
 * Nothing is printed unless mem_del_stat_print is non-zero.
 * The two tick totals are also shown converted to minutes and
 * seconds using hz.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t secs;

	if (!mem_del_stat_print)
		return;

	printf("memory delete loop %x/%x, statistics%s\n",
	    (uint_t)mhp->mh_transit.trl_spans->mds_base,
	    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
	    (mhp->mh_cancel ? " (cancelled)" : ""));

	/* Event counters. */
	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
	printf("\t%8u retired\n", mhp->mh_delstat.retired);
	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
	printf("\t%8u failing\n", mhp->mh_delstat.failing);
	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

	/* Tick totals, with ticks / hz giving whole seconds. */
	secs = mhp->mh_delstat.nticks_total / hz;
	printf(
	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

	secs = mhp->mh_delstat.nticks_pgrp / hz;
	printf(
	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
}
#endif /* MEM_DEL_STATS */
2969 2969
2970 2970 struct mem_callback { /* One registered (vector, argument) pair. */
2971 2971 kphysm_setup_vector_t *vec; /* Callback vector; NULL marks an unused slot. */
2972 2972 void *arg; /* Argument registered with vec; the pair identifies the entry. */
2973 2973 };
2974 2974
2975 2975 #define NMEMCALLBACKS 100 /* Capacity of mem_callbacks[]. */
2976 2976
2977 2977 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; /* Registration table. */
2978 2978 static uint_t nmemcallbacks; /* Slots in use from the start of the table; may include freed (NULL) holes. */
2979 2979 static krwlock_t mem_callback_rwlock; /* Taken as writer to (un)register and as reader to run callbacks. */
2980 2980
2981 2981 int
2982 2982 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
2983 2983 {
2984 2984 uint_t i, found;
2985 2985
2986 2986 /*
2987 2987 * This test will become more complicated when the version must
2988 2988 * change.
2989 2989 */
2990 2990 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
2991 2991 return (EINVAL);
2992 2992
2993 2993 if (vec->post_add == NULL || vec->pre_del == NULL ||
2994 2994 vec->post_del == NULL)
2995 2995 return (EINVAL);
2996 2996
2997 2997 rw_enter(&mem_callback_rwlock, RW_WRITER);
2998 2998 for (i = 0, found = 0; i < nmemcallbacks; i++) {
2999 2999 if (mem_callbacks[i].vec == NULL && found == 0)
3000 3000 found = i + 1;
3001 3001 if (mem_callbacks[i].vec == vec &&
3002 3002 mem_callbacks[i].arg == arg) {
3003 3003 #ifdef DEBUG
3004 3004 /* Catch this in DEBUG kernels. */
3005 3005 cmn_err(CE_WARN, "kphysm_setup_func_register"
3006 3006 "(0x%p, 0x%p) duplicate registration from 0x%p",
3007 3007 (void *)vec, arg, (void *)caller());
3008 3008 #endif /* DEBUG */
3009 3009 rw_exit(&mem_callback_rwlock);
3010 3010 return (EEXIST);
3011 3011 }
3012 3012 }
3013 3013 if (found != 0) {
3014 3014 i = found - 1;
3015 3015 } else {
3016 3016 ASSERT(nmemcallbacks < NMEMCALLBACKS);
3017 3017 if (nmemcallbacks == NMEMCALLBACKS) {
3018 3018 rw_exit(&mem_callback_rwlock);
3019 3019 return (ENOMEM);
3020 3020 }
3021 3021 i = nmemcallbacks++;
3022 3022 }
3023 3023 mem_callbacks[i].vec = vec;
3024 3024 mem_callbacks[i].arg = arg;
3025 3025 rw_exit(&mem_callback_rwlock);
3026 3026 return (0);
3027 3027 }
3028 3028
3029 3029 void
3030 3030 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
3031 3031 {
3032 3032 uint_t i;
3033 3033
3034 3034 rw_enter(&mem_callback_rwlock, RW_WRITER);
3035 3035 for (i = 0; i < nmemcallbacks; i++) {
3036 3036 if (mem_callbacks[i].vec == vec &&
3037 3037 mem_callbacks[i].arg == arg) {
3038 3038 mem_callbacks[i].vec = NULL;
3039 3039 mem_callbacks[i].arg = NULL;
3040 3040 if (i == (nmemcallbacks - 1))
3041 3041 nmemcallbacks--;
3042 3042 break;
3043 3043 }
3044 3044 }
3045 3045 rw_exit(&mem_callback_rwlock);
3046 3046 }
3047 3047
3048 3048 static void
3049 3049 kphysm_setup_post_add(pgcnt_t delta_pages)
3050 3050 {
3051 3051 uint_t i;
3052 3052
3053 3053 rw_enter(&mem_callback_rwlock, RW_READER);
3054 3054 for (i = 0; i < nmemcallbacks; i++) {
3055 3055 if (mem_callbacks[i].vec != NULL) {
3056 3056 (*mem_callbacks[i].vec->post_add)
3057 3057 (mem_callbacks[i].arg, delta_pages);
3058 3058 }
3059 3059 }
3060 3060 rw_exit(&mem_callback_rwlock);
3061 3061 }
3062 3062
3063 3063 /*
3064 3064 * Note the locking between pre_del and post_del: The reader lock is held
3065 3065 * between the two calls to stop the set of functions from changing.
3066 3066 */
3067 3067
3068 3068 static int
3069 3069 kphysm_setup_pre_del(pgcnt_t delta_pages)
3070 3070 {
3071 3071 uint_t i;
3072 3072 int ret;
3073 3073 int aret;
3074 3074
3075 3075 ret = 0;
3076 3076 rw_enter(&mem_callback_rwlock, RW_READER);
3077 3077 for (i = 0; i < nmemcallbacks; i++) {
3078 3078 if (mem_callbacks[i].vec != NULL) {
3079 3079 aret = (*mem_callbacks[i].vec->pre_del)
3080 3080 (mem_callbacks[i].arg, delta_pages);
3081 3081 ret |= aret;
3082 3082 }
3083 3083 }
3084 3084
3085 3085 return (ret);
3086 3086 }
3087 3087
3088 3088 static void
3089 3089 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
3090 3090 {
3091 3091 uint_t i;
3092 3092
3093 3093 for (i = 0; i < nmemcallbacks; i++) {
3094 3094 if (mem_callbacks[i].vec != NULL) {
3095 3095 (*mem_callbacks[i].vec->post_del)
3096 3096 (mem_callbacks[i].arg, delta_pages, cancelled);
3097 3097 }
3098 3098 }
3099 3099 rw_exit(&mem_callback_rwlock);
3100 3100 }
3101 3101
/*
 * Split the memseg that contains page frame 'base' so that the range
 * [base, base + npgs) is described by a memseg of its own (seg_mid),
 * with an optional lower (seg_low) and upper (seg_high) remainder.
 *
 * Returns 1 if the split was performed.  Returns 0, leaving the memseg
 * list unchanged, when:
 *   - no memseg contains 'base',
 *   - the containing memseg has MEMSEG_META_INCL set
 *     (memseg_includes_meta()),
 *   - the range runs past the end of the containing memseg, or
 *   - the range covers the whole memseg, so there is nothing to split.
 *
 * The original memseg is never freed; on success it is pushed onto the
 * memseg_edit_junk list.
 */
3102 3102 static int
3103 3103 kphysm_split_memseg(
3104 3104 pfn_t base,
3105 3105 pgcnt_t npgs)
3106 3106 {
3107 3107 struct memseg *seg;
3108 3108 struct memseg **segpp;
3109 3109 pgcnt_t size_low, size_high;
3110 3110 struct memseg *seg_low, *seg_mid, *seg_high;
3111 3111
3112 3112 /*
3113 3113 * Lock the memsegs list against other updates now
3114 3114 */
3115 3115 memsegs_lock(1);
3116 3116
3117 3117 /*
3118 3118 * Find boot time memseg that wholly covers this area.
3119 3119 */
3120 3120
3121 3121 /* First find the memseg with page 'base' in it. */
/* segpp tracks the link that points at seg, for the switch-over below. */
3122 3122 for (segpp = &memsegs; (seg = *segpp) != NULL;
3123 3123 segpp = &((*segpp)->next)) {
3124 3124 if (base >= seg->pages_base && base < seg->pages_end)
3125 3125 break;
3126 3126 }
3127 3127 if (seg == NULL) {
3128 3128 memsegs_unlock(1);
3129 3129 return (0);
3130 3130 }
/* A memseg flagged as including its own page_t metadata is not split. */
3131 3131 if (memseg_includes_meta(seg)) {
3132 3132 memsegs_unlock(1);
3133 3133 return (0);
3134 3134 }
/* The requested range must end within this same memseg. */
3135 3135 if ((base + npgs) > seg->pages_end) {
3136 3136 memsegs_unlock(1);
3137 3137 return (0);
3138 3138 }
3139 3139
3140 3140 /*
3141 3141 * Work out the size of the two segments that will
3142 3142 * surround the new segment, one for low address
3143 3143 * and one for high.
3144 3144 */
3145 3145 ASSERT(base >= seg->pages_base);
3146 3146 size_low = base - seg->pages_base;
3147 3147 ASSERT(seg->pages_end >= (base + npgs));
3148 3148 size_high = seg->pages_end - (base + npgs);
3149 3149
3150 3150 /*
3151 3151 * Sanity check.
3152 3152 */
/* Both remainders empty: the range is the entire memseg already. */
3153 3153 if ((size_low + size_high) == 0) {
3154 3154 memsegs_unlock(1);
3155 3155 return (0);
3156 3156 }
3157 3157
3158 3158 /*
3159 3159 * Allocate the new structures. The old memseg will not be freed
3160 3160 * as there may be a reference to it.
3161 3161 */
3162 3162 seg_low = NULL;
3163 3163 seg_high = NULL;
3164 3164
3165 3165 if (size_low != 0)
3166 3166 seg_low = memseg_alloc();
3167 3167
3168 3168 seg_mid = memseg_alloc();
3169 3169
3170 3170 if (size_high != 0)
3171 3171 seg_high = memseg_alloc();
3172 3172
3173 3173 /*
3174 3174 * All allocation done now.
3175 3175 */
/* Build the replacement chain: [seg_low ->] seg_mid [-> seg_high] -> seg->next. */
3176 3176 if (size_low != 0) {
3177 3177 seg_low->pages = seg->pages;
3178 3178 seg_low->epages = seg_low->pages + size_low;
3179 3179 seg_low->pages_base = seg->pages_base;
3180 3180 seg_low->pages_end = seg_low->pages_base + size_low;
3181 3181 seg_low->next = seg_mid;
3182 3182 seg_low->msegflags = seg->msegflags;
3183 3183 }
3184 3184 if (size_high != 0) {
3185 3185 seg_high->pages = seg->epages - size_high;
3186 3186 seg_high->epages = seg_high->pages + size_high;
3187 3187 seg_high->pages_base = seg->pages_end - size_high;
3188 3188 seg_high->pages_end = seg_high->pages_base + size_high;
3189 3189 seg_high->next = seg->next;
3190 3190 seg_high->msegflags = seg->msegflags;
3191 3191 }
3192 3192
3193 3193 seg_mid->pages = seg->pages + size_low;
3194 3194 seg_mid->pages_base = seg->pages_base + size_low;
3195 3195 seg_mid->epages = seg->epages - size_high;
3196 3196 seg_mid->pages_end = seg->pages_end - size_high;
3197 3197 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
3198 3198 seg_mid->msegflags = seg->msegflags;
3199 3199
3200 3200 /*
3201 3201 * Update hat_kpm specific info of all involved memsegs and
3202 3202 * allow hat_kpm specific global chain updates.
3203 3203 */
3204 3204 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
3205 3205
3206 3206 /*
3207 3207 * At this point we have two equivalent memseg sub-chains,
3208 3208 * seg and seg_low/seg_mid/seg_high, which both chain on to
3209 3209 * the same place in the global chain. By re-writing the pointer
3210 3210 * in the previous element we switch atomically from using the old
3211 3211 * (seg) to the new.
3212 3212 */
3213 3213 *segpp = (seg_low != NULL) ? seg_low : seg_mid;
3214 3214
3215 3215 membar_enter();
3216 3216
3217 3217 build_pfn_hash();
3218 3218 memsegs_unlock(1);
3219 3219
3220 3220 /*
3221 3221 * We leave the old segment, 'seg', intact as there may be
3222 3222 * references to it. Also, as the value of total_pages has not
3223 3223 * changed and the memsegs list is effectively the same when
3224 3224 * accessed via the old or the new pointer, we do not have to
3225 3225 * cause pageout_scanner() to re-evaluate its hand pointers.
3226 3226 *
3227 3227 * We currently do not re-use or reclaim the page_t memory.
3228 3228 * If we do, then this may have to change.
3229 3229 */
3230 3230
/* Park the retired memseg on the memseg_edit_junk list (linked via lnext). */
3231 3231 mutex_enter(&memseg_lists_lock);
3232 3232 seg->lnext = memseg_edit_junk;
3233 3233 memseg_edit_junk = seg;
3234 3234 mutex_exit(&memseg_lists_lock);
3235 3235
3236 3236 return (1);
3237 3237 }
3238 3238
3239 3239 /*
3240 3240 * The sfmmu hat layer (e.g.) accesses some parts of the memseg
3241 3241 * structure using physical addresses. Therefore a kmem_cache is
3242 3242 * used with KMC_NOHASH to avoid page crossings within a memseg
3243 3243 * structure. KMC_NOHASH requires that no external (outside of
3244 3244 * slab) information is allowed. This, in turn, implies that the
3245 3245 * cache's slabsize must be exactly a single page, since per-slab
3246 3246 * information (e.g. the freelist for the slab) is kept at the
3247 3247 * end of the slab, where it is easy to locate. Should be changed
3248 3248 * when a more obvious kmem_cache interface/flag will become
3249 3249 * available.
3250 3250 */
3251 3251 void
3252 3252 mem_config_init()
3253 3253 {
3254 3254 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
3255 3255 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
3256 3256 }
3257 3257
3258 3258 struct memseg *
3259 3259 memseg_alloc()
3260 3260 {
3261 3261 struct memseg *seg;
3262 3262
3263 3263 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
3264 3264 bzero(seg, sizeof (struct memseg));
3265 3265
3266 3266 return (seg);
3267 3267 }
3268 3268
3269 3269 /*
3270 3270 * Return whether the page_t memory for this memseg
3271 3271 * is included in the memseg itself.
3272 3272 */
3273 3273 static int
3274 3274 memseg_includes_meta(struct memseg *seg)
3275 3275 {
3276 3276 return (seg->msegflags & MEMSEG_META_INCL);
3277 3277 }
3278 3278
3279 3279 pfn_t
3280 3280 memseg_get_start(struct memseg *seg)
3281 3281 {
3282 3282 pfn_t pt_start;
3283 3283
3284 3284 if (memseg_includes_meta(seg)) {
3285 3285 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
3286 3286
3287 3287 /* Meta data is required to be at the beginning */
3288 3288 ASSERT(pt_start < seg->pages_base);
3289 3289 } else
3290 3290 pt_start = seg->pages_base;
3291 3291
3292 3292 return (pt_start);
3293 3293 }
3294 3294
↓ open down ↓ |
3294 lines elided |
↑ open up ↑ |
3295 3295 /*
3296 3296 * Invalidate memseg pointers in cpu private vm data caches.
3297 3297 */
/*
 * Clears the vc_pnum_memseg and vc_pnext_memseg pointers in the
 * cpu_vm_data of every cpu on cpu_list.  The walk is done with
 * cpu_lock held and between pause_cpus() and start_cpus().
 */
3298 3298 static void
3299 3299 memseg_cpu_vm_flush()
3300 3300 {
3301 3301 cpu_t *cp;
3302 3302 vm_cpu_data_t *vc;
3303 3303
3304 3304 mutex_enter(&cpu_lock);
/*
 * NOTE(review): the next two lines are the removed (-) and added (+)
 * sides of a diff hunk; presumably only the two-argument call is
 * meant to be live -- confirm against the current pause_cpus()
 * prototype.
 */
3305 - pause_cpus(NULL);
3305 + pause_cpus(NULL, NULL);
3306 3306
/* cpu_list is walked as a circular list: stop on returning to the head. */
3307 3307 cp = cpu_list;
3308 3308 do {
3309 3309 vc = cp->cpu_vm_data;
3310 3310 vc->vc_pnum_memseg = NULL;
3311 3311 vc->vc_pnext_memseg = NULL;
3312 3312
3313 3313 } while ((cp = cp->cpu_next) != cpu_list);
3314 3314
3315 3315 start_cpus();
3316 3316 mutex_exit(&cpu_lock);
3317 3317 }
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX