XXXX pass in cpu_pause_func via pause_cpus
--- old/usr/src/uts/sun4v/os/mpo.c
+++ new/usr/src/uts/sun4v/os/mpo.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 #include <sys/types.h>
28 28 #include <sys/sysmacros.h>
29 29 #include <sys/machsystm.h>
30 30 #include <sys/machparam.h>
31 31 #include <sys/cmn_err.h>
32 32 #include <sys/stat.h>
33 33 #include <sys/mach_descrip.h>
34 34 #include <sys/memnode.h>
35 35 #include <sys/mdesc.h>
36 36 #include <sys/mpo.h>
37 37 #include <vm/page.h>
38 38 #include <vm/vm_dep.h>
39 39 #include <vm/hat_sfmmu.h>
40 40 #include <sys/promif.h>
41 41
42 42 /*
43 43 * MPO and the sun4v memory representation
44 44 * ---------------------------------------
45 45 *
46 46 * Latency groups are defined in the sun4v architecture by memory-latency-group
47 47 * nodes in the Machine Description, as specified in FWARC/2007/260. These
48 48 * tie together cpu nodes and mblock nodes, and contain mask and match
49 49 * properties that identify the portion of an mblock that belongs to the
50 50 * lgroup. Mask and match are defined in the Physical Address (PA) space,
51 51 * but an mblock defines Real Addresses (RA). To translate, the mblock
52 52 * includes the property address-congruence-offset, hereafter referred to as
53 53 * ra_to_pa. A real address ra is a member of an lgroup if
54 54 *
55 55 * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
56 56 *
57 57 * The MD is traversed, and information on all mblocks is kept in the array
58 58 * mpo_mblock[]. Information on all CPUs, including which lgroup they map
59 59 * to, is kept in the array mpo_cpu[].
60 60 *
61 61 * This implementation makes (and verifies) the simplifying assumption that
62 62 * the mask bits are the same for all defined lgroups, and that all 1 bits in
63 63 * the mask are contiguous. Thus the number of lgroups is bounded by the
64 64 * number of possible mask values, and the lgrp_handle_t is defined as the
65 65 * mask value, shifted right to eliminate the 0 bit positions in mask. The
66 66 * masks and values are also referred to as "home bits" in the code.
67 67 *
68 68 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
69 69 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
70 70 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
71 71 * home bits. This yields the mem_node.
72 72 *
73 73 * Interfaces
74 74 * ----------
75 75 *
76 76 * This file exports the following entry points:
77 77 *
78 78 * plat_lgrp_init()
79 79 * plat_build_mem_nodes()
80 80 * plat_lgrp_cpu_to_hand()
81 81 * plat_lgrp_latency()
82 82 * plat_pfn_to_mem_node()
83 83 * These implement the usual platform lgroup interfaces.
84 84 *
85 85 * plat_rapfn_to_papfn()
86 86 * Recover the PA page coloring bits from an RA.
87 87 *
88 88 * plat_mem_node_iterator_init()
89 89 * Initialize an iterator to efficiently step through pages in a mem_node.
90 90 *
91 91 * plat_mem_node_intersect_range()
92 92 * Find the intersection with a mem_node.
93 93 *
94 94 * plat_slice_add()
95 95 * plat_slice_del()
96 96 * Platform hooks to add/delete a pfn range.
97 97 *
98 98 * Internal Organization
99 99 * ---------------------
100 100 *
101 101 * A number of routines are used by both boot and DR code to (re)build the
102 102 * appropriate MPO structures.
103 103 *
104 104 * mblock_alloc()
105 105 * Allocate memory for mblocks and stripes as
106 106 * appropriate for boot or memory DR.
107 107 *
108 108 * mblock_free()
109 109 * Free memory allocated by mblock_alloc.
110 110 *
111 111 * mblock_update()
112 112 * Build mblocks based on mblock nodes read from the MD.
113 113 *
114 114 * mblock_update_add()
115 115 * Rebuild mblocks after a memory DR add operation.
116 116 *
117 117 * mblock_update_del()
118 118 * Rebuild mblocks after a memory DR delete operation.
119 119 *
120 120 * mblock_install()
121 121 * Install mblocks as the new configuration.
122 122 *
123 123 * mstripe_update()
124 124 * Build stripes based on mblocks.
125 125 *
126 126 * mnode_update()
127 127 * Call memnode layer to add/del a pfn range, based on stripes.
128 128 *
129 129 * The platform interfaces allocate all memory required for the
130 130 * particular update first, block access to the MPO structures
131 131 * while they are updated, and free old structures after the update.
132 132 */
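
The membership test and home-bit extraction described in the comment above can be restated in a few lines of standalone C. This is an illustrative sketch only, not part of the diffed file: the ex_* names are invented, and it assumes a nonzero mask with contiguous 1 bits (as the comment requires) and a GCC-style __builtin_ctzll for counting the low zero bits.

    #include <stdint.h>

    struct ex_mblock {
            uint64_t base;          /* RA base of the mblock */
            uint64_t size;
            uint64_t ra_to_pa;      /* address-congruence-offset */
    };

    /* Does real address ra belong to the lgroup with (mask, match)? */
    static int
    ex_ra_in_lgroup(uint64_t ra, const struct ex_mblock *mb,
        uint64_t mask, uint64_t match)
    {
            return (((ra + mb->ra_to_pa) & mask) == match);
    }

    /* Extract the home bits: the lgrp_handle / mem_node for ra. */
    static uint64_t
    ex_ra_to_home(uint64_t ra, const struct ex_mblock *mb, uint64_t mask)
    {
            int shift = __builtin_ctzll(mask);  /* drop the 0 bits below the mask */

            return (((ra + mb->ra_to_pa) & mask) >> shift);
    }
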
133 133
134 134 int sun4v_mpo_enable = 1;
135 135 int sun4v_mpo_debug = 0;
136 136 char sun4v_mpo_status[256] = "";
137 137
138 138 /* Save CPU info from the MD and associate CPUs with lgroups */
139 139 static struct cpu_md mpo_cpu[NCPU];
140 140
141 141 /* Save lgroup info from the MD */
142 142 #define MAX_MD_LGROUPS 32
143 143 static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
144 144 static int n_lgrpnodes = 0;
145 145 static int n_locality_groups = 0;
146 146 static int max_locality_groups = 0;
147 147 static int szc_mask0 = 0;
148 148
149 149 /* Save mblocks from the MD */
150 150 #define SMALL_MBLOCKS_COUNT 8
151 151 static struct mblock_md *mpo_mblock;
152 152 static struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
153 153 static int n_mblocks = 0;
154 154
155 155 /* Save mem_node stripes calculated from mblocks and lgroups. */
156 156 static mem_stripe_t *mem_stripes;
157 157 static mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
158 158 static int n_mem_stripes = 0;
159 159 static pfn_t mnode_stride; /* distance between stripes, start to start */
160 160 static int stripe_shift; /* stride/stripes expressed as a shift */
161 161 static pfn_t mnode_pages; /* mem_node stripe width */
162 162
163 163 /* Save home mask and shift used to calculate lgrp_handle_t values */
164 164 static uint64_t home_mask = 0;
165 165 static pfn_t home_mask_pfn = 0;
166 166 static int home_mask_shift = 0;
167 167 static uint_t home_mask_pfn_shift = 0;
168 168
169 169 /* Save lowest and highest latencies found across all lgroups */
170 170 static int lower_latency = 0;
171 171 static int higher_latency = 0;
172 172
173 173 static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */
174 174 static int mpo_genid; /* config gen; updated by mem DR */
175 175 static mpo_config_t mpo_config; /* current mblocks and stripes */
176 176
177 177 typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;
178 178
179 179 static int valid_pages(md_t *md, mde_cookie_t cpu0);
180 180 static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
181 181 static int fix_interleave(void);
182 182
183 183 static int mblock_alloc(mpo_config_t *, update_t, int nmblocks);
184 184 static void mblock_install(mpo_config_t *);
185 185 static void mblock_free(mpo_config_t *);
186 186 static void mblock_update(mpo_config_t *, md_t, mde_cookie_t *mblocknodes);
187 187 static void mblock_update_add(mpo_config_t *);
188 188 static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
189 189 static void mstripe_update(mpo_config_t *);
190 190 static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);
191 191
192 192 /* Debug support */
193 193 #if defined(DEBUG) && !defined(lint)
194 194 #define VALIDATE_SLICE(base, end) { \
195 195 ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M))); \
196 196 ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M))); \
197 197 }
198 198 #define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
199 199 #else
200 200 #define VALIDATE_SLICE(base, end)
201 201 #define MPO_DEBUG(...)
202 202 #endif /* DEBUG */
203 203
204 204 /* Record status message, viewable from mdb */
205 205 #define MPO_STATUS(args...) { \
206 206 (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
207 207 MPO_DEBUG(sun4v_mpo_status); \
208 208 }
209 209
210 210 /*
211 211 * The MPO locks are to protect the MPO metadata while that
212 212 * information is updated as a result of a memory DR operation.
213 213 * The read lock must be acquired to read the metadata and the
214 214 * write locks must be acquired to update it.
215 215 */
216 216 #define mpo_rd_lock kpreempt_disable
217 217 #define mpo_rd_unlock kpreempt_enable
218 218
219 219 static void
220 220 mpo_wr_lock()
221 221 {
222 222 mutex_enter(&cpu_lock);
223 - pause_cpus(NULL);
223 + pause_cpus(NULL, NULL);
224 224 mutex_exit(&cpu_lock);
225 225 }
226 226
227 227 static void
228 228 mpo_wr_unlock()
229 229 {
230 230 mutex_enter(&cpu_lock);
231 231 start_cpus();
232 232 mutex_exit(&cpu_lock);
233 233 }
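
A minimal sketch of how these primitives are meant to be used, per the comment above: readers disable preemption around their accesses, while the DR path pauses all other CPUs before swapping in a new configuration. This is editorial illustration only; ex_read_n_mblocks is a hypothetical caller, not code from the file.

    static int
    ex_read_n_mblocks(void)
    {
            int n;

            mpo_rd_lock();          /* kpreempt_disable(): a writer cannot   */
            n = n_mblocks;          /* complete pause_cpus() while we run,   */
            mpo_rd_unlock();        /* so the arrays stay consistent here    */
            return (n);
    }
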
234 234
235 235 /*
236 236 * Routine to read a uint64_t from a given md
237 237 */
238 238 static int64_t
239 239 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
240 240 {
241 241 int err = md_get_prop_val(md, node, propname, val);
242 242 return (err);
243 243 }
244 244
245 245 static int
246 246 mblock_cmp(const void *a, const void *b)
247 247 {
248 248 struct mblock_md *m1 = (struct mblock_md *)a;
249 249 struct mblock_md *m2 = (struct mblock_md *)b;
250 250
251 251 if (m1->base < m2->base)
252 252 return (-1);
253 253 else if (m1->base == m2->base)
254 254 return (0);
255 255 else
256 256 return (1);
257 257 }
258 258
259 259 static void
260 260 mblock_sort(struct mblock_md *mblocks, int n)
261 261 {
262 262 extern void qsort(void *, size_t, size_t,
263 263 int (*)(const void *, const void *));
264 264
265 265 qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
266 266 }
267 267
268 268 static void
269 269 mpo_update_tunables(void)
270 270 {
271 271 int i, ncpu_min;
272 272
273 273 /*
274 274 * lgrp_expand_proc_thresh is the minimum load on the lgroups
275 275 * this process is currently running on before considering
276 276 * expanding threads to another lgroup.
277 277 *
278 278 * lgrp_expand_proc_diff determines how much less the remote lgroup
279 279 * must be loaded before expanding to it.
280 280 *
281 281 * On sun4v CMT processors, threads share a core pipeline, and
282 282 * at less than 100% utilization, best throughput is obtained by
283 283 * spreading threads across more cores, even if some are in a
284 284 * different lgroup. Spread threads to a new lgroup if the
285 285 * current group is more than 50% loaded. Because of virtualization,
286 286 * lgroups may have different numbers of CPUs, but the tunables
287 287 * apply to all lgroups, so find the smallest lgroup and compute
288 288 * 50% loading.
289 289 */
290 290
291 291 ncpu_min = NCPU;
292 292 for (i = 0; i < n_lgrpnodes; i++) {
293 293 int ncpu = mpo_lgroup[i].ncpu;
294 294 if (ncpu != 0 && ncpu < ncpu_min)
295 295 ncpu_min = ncpu;
296 296 }
297 297 lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
298 298
299 299 /* new home may only be half as loaded as the existing home to use it */
300 300 lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
301 301
302 302 lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
303 303 }
304 304
305 305 static mde_cookie_t
306 306 cpuid_to_cpunode(md_t *md, int cpuid)
307 307 {
308 308 mde_cookie_t rootnode, foundnode, *cpunodes;
309 309 uint64_t cpuid_prop;
310 310 int n_cpunodes, i;
311 311
312 312 if (md == NULL)
313 313 return (MDE_INVAL_ELEM_COOKIE);
314 314
315 315 rootnode = md_root_node(md);
316 316 if (rootnode == MDE_INVAL_ELEM_COOKIE)
317 317 return (MDE_INVAL_ELEM_COOKIE);
318 318
319 319 n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
320 320 "fwd", &cpunodes);
321 321 if (n_cpunodes <= 0 || n_cpunodes > NCPU)
322 322 goto cpuid_fail;
323 323
324 324 for (i = 0; i < n_cpunodes; i++) {
325 325 if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
326 326 &cpuid_prop))
327 327 break;
328 328 if (cpuid_prop == (uint64_t)cpuid) {
329 329 foundnode = cpunodes[i];
330 330 md_free_scan_dag(md, &cpunodes);
331 331 return (foundnode);
332 332 }
333 333 }
334 334 cpuid_fail:
335 335 if (n_cpunodes > 0)
336 336 md_free_scan_dag(md, &cpunodes);
337 337 return (MDE_INVAL_ELEM_COOKIE);
338 338 }
339 339
340 340 static int
341 341 mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
342 342 {
343 343 mde_cookie_t *nodes;
344 344 uint64_t latency, lowest_latency;
345 345 uint64_t address_match, lowest_address_match;
346 346 int n_lgroups, j, result = 0;
347 347
348 348 /* Find lgroup nodes reachable from this cpu */
349 349 n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
350 350 "fwd", &nodes);
351 351
352 352 lowest_latency = ~(0UL);
353 353
354 354 /* Find the lgroup node with the smallest latency */
355 355 for (j = 0; j < n_lgroups; j++) {
356 356 result = get_int(md, nodes[j], PROP_LG_LATENCY,
357 357 &latency);
358 358 result |= get_int(md, nodes[j], PROP_LG_MATCH,
359 359 &address_match);
360 360 if (result != 0) {
361 361 j = -1;
362 362 goto to_lgrp_done;
363 363 }
364 364 if (latency < lowest_latency) {
365 365 lowest_latency = latency;
366 366 lowest_address_match = address_match;
367 367 }
368 368 }
369 369 for (j = 0; j < n_lgrpnodes; j++) {
370 370 if ((mpo_lgroup[j].latency == lowest_latency) &&
371 371 (mpo_lgroup[j].addr_match == lowest_address_match))
372 372 break;
373 373 }
374 374 if (j == n_lgrpnodes)
375 375 j = -1;
376 376
377 377 to_lgrp_done:
378 378 if (n_lgroups > 0)
379 379 md_free_scan_dag(md, &nodes);
380 380 return (j);
381 381 }
382 382
383 383 /* Called when DR'ing in a CPU */
384 384 void
385 385 mpo_cpu_add(md_t *md, int cpuid)
386 386 {
387 387 mde_cookie_t cpunode;
388 388
389 389 int i;
390 390
391 391 if (n_lgrpnodes <= 0)
392 392 return;
393 393
394 394 if (md == NULL)
395 395 goto add_fail;
396 396
397 397 cpunode = cpuid_to_cpunode(md, cpuid);
398 398 if (cpunode == MDE_INVAL_ELEM_COOKIE)
399 399 goto add_fail;
400 400
401 401 i = mpo_cpu_to_lgroup(md, cpunode);
402 402 if (i == -1)
403 403 goto add_fail;
404 404
405 405 mpo_cpu[cpuid].lgrp_index = i;
406 406 mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
407 407 mpo_lgroup[i].ncpu++;
408 408 mpo_update_tunables();
409 409 return;
410 410 add_fail:
411 411 panic("mpo_cpu_add: Cannot read MD");
412 412 }
413 413
414 414 /* Called when DR'ing out a CPU */
415 415 void
416 416 mpo_cpu_remove(int cpuid)
417 417 {
418 418 int i;
419 419
420 420 if (n_lgrpnodes <= 0)
421 421 return;
422 422
423 423 i = mpo_cpu[cpuid].lgrp_index;
424 424 mpo_lgroup[i].ncpu--;
425 425 mpo_cpu[cpuid].home = 0;
426 426 mpo_cpu[cpuid].lgrp_index = -1;
427 427 mpo_update_tunables();
428 428 }
429 429
430 430 static mde_cookie_t
431 431 md_get_root(md_t *md)
432 432 {
433 433 mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
434 434 int n_nodes;
435 435
436 436 n_nodes = md_node_count(md);
437 437
438 438 if (n_nodes <= 0) {
439 439 MPO_STATUS("md_get_root: No nodes in node count\n");
440 440 return (root);
441 441 }
442 442
443 443 root = md_root_node(md);
444 444
445 445 if (root == MDE_INVAL_ELEM_COOKIE) {
446 446 MPO_STATUS("md_get_root: Root node is missing\n");
447 447 return (root);
448 448 }
449 449
450 450 MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
451 451 MPO_DEBUG("md_get_root: md: %p\n", md);
452 452 MPO_DEBUG("md_get_root: root: %lx\n", root);
453 453 done:
454 454 return (root);
455 455 }
456 456
457 457 static int
458 458 lgrp_update(md_t *md, mde_cookie_t root)
459 459 {
460 460 int i, j, result;
461 461 int ret_val = 0;
462 462 int sub_page_fix;
463 463 mde_cookie_t *nodes, *lgrpnodes;
464 464
465 465 n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
466 466 "fwd", &lgrpnodes);
467 467
468 468 if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
469 469 MPO_STATUS("lgrp_update: No Lgroups\n");
470 470 ret_val = -1;
471 471 goto fail;
472 472 }
473 473
474 474 MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);
475 475
476 476 for (i = 0; i < n_lgrpnodes; i++) {
477 477 mpo_lgroup[i].node = lgrpnodes[i];
478 478 mpo_lgroup[i].id = i;
479 479 mpo_lgroup[i].ncpu = 0;
480 480 result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
481 481 &mpo_lgroup[i].addr_mask);
482 482 result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
483 483 &mpo_lgroup[i].addr_match);
484 484
485 485 /*
486 486 * If either the mask or match properties are missing, set to 0
487 487 */
488 488 if (result < 0) {
489 489 mpo_lgroup[i].addr_mask = 0;
490 490 mpo_lgroup[i].addr_match = 0;
491 491 }
492 492
493 493 /* Set latency to 0 if property not present */
494 494
495 495 result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
496 496 &mpo_lgroup[i].latency);
497 497 if (result < 0)
498 498 mpo_lgroup[i].latency = 0;
499 499 }
500 500
501 501 /*
502 502 * Sub-page level interleave is not yet supported. Check for it,
503 503 * and remove sub-page interleaved lgroups from mpo_lgroup and
504 504 * n_lgrpnodes. If no lgroups are left, return.
505 505 */
506 506
507 507 sub_page_fix = fix_interleave();
508 508 if (n_lgrpnodes == 0) {
509 509 ret_val = -1;
510 510 goto fail;
511 511 }
512 512
513 513 /* Ensure that all of the addr_mask values are the same */
514 514
515 515 for (i = 0; i < n_lgrpnodes; i++) {
516 516 if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
517 517 MPO_STATUS("lgrp_update: "
518 518 "addr_mask values are not the same\n");
519 519 ret_val = -1;
520 520 goto fail;
521 521 }
522 522 }
523 523
524 524 /*
525 525 * Ensure that all lgrp nodes see all the mblocks. However, if
526 526 * sub-page interleave is being fixed, they do not, so skip
527 527 * the check.
528 528 */
529 529
530 530 if (sub_page_fix == 0) {
531 531 for (i = 0; i < n_lgrpnodes; i++) {
532 532 j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
533 533 PROP_LG_MBLOCK, "fwd", &nodes);
534 534 md_free_scan_dag(md, &nodes);
535 535 if (j != n_mblocks) {
536 536 MPO_STATUS("lgrp_update: "
537 537 "sub-page interleave is being fixed\n");
538 538 ret_val = -1;
539 539 goto fail;
540 540 }
541 541 }
542 542 }
543 543 fail:
544 544 if (n_lgrpnodes > 0) {
545 545 md_free_scan_dag(md, &lgrpnodes);
546 546 for (i = 0; i < n_lgrpnodes; i++)
547 547 mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
548 548 }
549 549
550 550 return (ret_val);
551 551 }
552 552
553 553 /*
554 554 *
555 555 * Traverse the MD to determine:
556 556 *
557 557 * Number of CPU nodes, lgrp_nodes, and mblocks
558 558 * Then for each lgrp_node, obtain the appropriate data.
559 559 * For each CPU, determine its home locality and store it.
560 560 * For each mblock, retrieve its data and store it.
561 561 */
562 562 static int
563 563 lgrp_traverse(md_t *md)
564 564 {
565 565 mde_cookie_t root, *cpunodes, *mblocknodes;
566 566 int o;
567 567 uint64_t i, k, stripe, stride;
568 568 uint64_t mem_lg_homeset = 0;
569 569 int ret_val = 0;
570 570 int result = 0;
571 571 int n_cpunodes = 0;
572 572 mpo_config_t new_config;
573 573
574 574 if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
575 575 ret_val = -1;
576 576 goto fail;
577 577 }
578 578
579 579 n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
580 580 &mblocknodes);
581 581 if (n_mblocks <= 0) {
582 582 MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
583 583 "Descriptor\n");
584 584 ret_val = -1;
585 585 goto fail;
586 586 }
587 587
588 588 /*
589 589 * Build the Memory Nodes. Do this before any possibility of
590 590 * bailing from this routine so we obtain ra_to_pa (needed for page
591 591 * coloring) even when there are no lgroups defined.
592 592 */
593 593 if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
594 594 ret_val = -1;
595 595 goto fail;
596 596 }
597 597
598 598 mblock_update(&new_config, md, mblocknodes);
599 599 mblock_install(&new_config);
600 600
601 601 /* Page coloring hook is required so we can iterate through mnodes */
602 602 if (&page_next_pfn_for_color_cpu == NULL) {
603 603 MPO_STATUS("lgrp_traverse: No page coloring support\n");
604 604 ret_val = -1;
605 605 goto fail;
606 606 }
607 607
608 608 /* Global enable for mpo */
609 609 if (sun4v_mpo_enable == 0) {
610 610 MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
611 611 ret_val = -1;
612 612 goto fail;
613 613 }
614 614
615 615 n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
616 616
617 617 if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
618 618 MPO_STATUS("lgrp_traverse: No CPU nodes detected "
619 619 "in MD\n");
620 620 ret_val = -1;
621 621 goto fail;
622 622 }
623 623
624 624 MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
625 625
626 626 if ((ret_val = lgrp_update(md, root)) == -1)
627 627 goto fail;
628 628
629 629 /*
630 630 * Use the address mask from the first lgroup node
631 631 * to establish our home_mask.
632 632 */
633 633 home_mask = mpo_lgroup[0].addr_mask;
634 634 home_mask_pfn = btop(home_mask);
635 635 home_mask_shift = lowbit(home_mask) - 1;
636 636 home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
637 637 mnode_pages = btop(1ULL << home_mask_shift);
638 638
639 639 /*
640 640 * How many values are possible in home mask? Assume the mask
641 641 * bits are contiguous.
642 642 */
643 643 max_locality_groups =
644 644 1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
645 645
646 646 stripe_shift = highbit(max_locality_groups) - 1;
647 647 stripe = ptob(mnode_pages);
648 648 stride = max_locality_groups * stripe;
649 649 mnode_stride = btop(stride);
650 650
651 651 /* Now verify the home mask bits are contiguous */
652 652
653 653 if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
654 654 MPO_STATUS("lgrp_traverse: "
655 655 "home mask bits are not contiguous\n");
656 656 ret_val = -1;
657 657 goto fail;
658 658 }
659 659
660 660 /* Record all of the home bits */
661 661
662 662 for (i = 0; i < n_lgrpnodes; i++) {
663 663 HOMESET_ADD(mem_lg_homeset,
664 664 mpo_lgroup[i].addr_match >> home_mask_shift);
665 665 }
666 666
667 667 /* Count the number of different "home" mem_lg's we've discovered */
668 668
669 669 n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
670 670
671 671 /* If we have only 1 locality group then we can exit */
672 672 if (n_locality_groups == 1) {
673 673 MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
674 674 ret_val = -1;
675 675 goto fail;
676 676 }
677 677
678 678 /*
679 679 * Set the latencies. A CPU's lgroup is defined by the lowest
680 680 * latency found. All other memory is considered remote, and the
681 681 * remote latency is represented by the highest latency found.
682 682 * Thus hierarchical lgroups, if any, are approximated by a
683 683 * two level scheme.
684 684 *
685 685 * The Solaris MPO framework by convention wants to see latencies
686 686 * in units of nano-sec/10. In the MD, the units are defined to be
687 687 * pico-seconds.
688 688 */
689 689
690 690 lower_latency = mpo_lgroup[0].latency;
691 691 higher_latency = mpo_lgroup[0].latency;
692 692
693 693 for (i = 1; i < n_lgrpnodes; i++) {
694 694 if (mpo_lgroup[i].latency < lower_latency) {
695 695 lower_latency = mpo_lgroup[i].latency;
696 696 }
697 697 if (mpo_lgroup[i].latency > higher_latency) {
698 698 higher_latency = mpo_lgroup[i].latency;
699 699 }
700 700 }
701 701 lower_latency /= 10000;
702 702 higher_latency /= 10000;
703 703
704 704 /* Clear our CPU data */
705 705
706 706 for (i = 0; i < NCPU; i++) {
707 707 mpo_cpu[i].home = 0;
708 708 mpo_cpu[i].lgrp_index = -1;
709 709 }
710 710
711 711 /* Build the CPU nodes */
712 712 for (i = 0; i < n_cpunodes; i++) {
713 713
714 714 /* Read in the lgroup nodes */
715 715 result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
716 716 if (result < 0) {
717 717 MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
718 718 ret_val = -1;
719 719 goto fail;
720 720 }
721 721
722 722 o = mpo_cpu_to_lgroup(md, cpunodes[i]);
723 723 if (o == -1) {
724 724 ret_val = -1;
725 725 goto fail;
726 726 }
727 727 mpo_cpu[k].lgrp_index = o;
728 728 mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
729 729 mpo_lgroup[o].ncpu++;
730 730 }
731 731 /* Validate that no large pages cross mnode boundaries. */
732 732 if (valid_pages(md, cpunodes[0]) == 0) {
733 733 ret_val = -1;
734 734 goto fail;
735 735 }
736 736
737 737 fail:
738 738 if (n_cpunodes > 0)
739 739 md_free_scan_dag(md, &cpunodes);
740 740 if (n_mblocks > 0)
741 741 md_free_scan_dag(md, &mblocknodes);
742 742 else
743 743 panic("lgrp_traverse: No memory blocks found");
744 744
745 745 if (ret_val == 0) {
746 746 MPO_STATUS("MPO feature is enabled.\n");
747 747 } else
748 748 sun4v_mpo_enable = 0; /* set this for DR */
749 749
750 750 return (ret_val);
751 751 }
752 752
753 753 /*
754 754 * Determine the number of unique mem_lg's present in our system
755 755 */
756 756 static int
757 757 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
758 758 {
759 759 int homeid;
760 760 int count = 0;
761 761
762 762 /*
763 763 * Scan the "home" bits of the mem_lgs, count
764 764 * the number that are unique.
765 765 */
766 766
767 767 for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
768 768 if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
769 769 count++;
770 770 }
771 771 }
772 772
773 773 MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
774 774 mem_lg_homeset);
775 775 MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
776 776
777 777 /* Default must be at least one */
778 778 if (count == 0)
779 779 count = 1;
780 780
781 781 return (count);
782 782 }
783 783
784 784 /*
785 785 * Platform specific lgroup initialization
786 786 */
787 787 void
788 788 plat_lgrp_init(void)
789 789 {
790 790 md_t *md;
791 791 int rc;
792 792
793 793 /* Get the Machine Descriptor handle */
794 794
795 795 md = md_get_handle();
796 796
797 797 /* If not, we cannot continue */
798 798
799 799 if (md == NULL) {
800 800 panic("cannot access machine descriptor\n");
801 801 } else {
802 802 rc = lgrp_traverse(md);
803 803 (void) md_fini_handle(md);
804 804 }
805 805
806 806 /*
807 807 * If we can't process the MD for lgroups then at least let the
808 808 * system try to boot. Assume we have one lgroup so that
809 809 * when plat_build_mem_nodes is called, it will attempt to init
810 810 * an mnode based on the supplied memory segment.
811 811 */
812 812
813 813 if (rc == -1) {
814 814 home_mask_pfn = 0;
815 815 max_locality_groups = 1;
816 816 n_locality_groups = 1;
817 817 return;
818 818 }
819 819
820 820 mem_node_pfn_shift = 0;
821 821 mem_node_physalign = 0;
822 822
823 823 /* Use lgroup-aware TSB allocations */
824 824 tsb_lgrp_affinity = 1;
825 825
826 826 /* Require that a home lgroup have some memory to be chosen */
827 827 lgrp_mem_free_thresh = 1;
828 828
829 829 /* Standard home-on-next-touch policy */
830 830 lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
831 831
832 832 /* Disable option to choose root lgroup if all leaf lgroups are busy */
833 833 lgrp_load_thresh = UINT32_MAX;
834 834
835 835 mpo_update_tunables();
836 836 }
837 837
838 838 /*
839 839 * Helper routine for debugging calls to mem_node_add_slice()
840 840 */
841 841 static void
842 842 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
843 843 {
844 844 #if defined(DEBUG) && !defined(lint)
845 845 static int slice_count = 0;
846 846
847 847 slice_count++;
848 848 MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
849 849 slice_count, basepfn, endpfn);
850 850 #endif
851 851 mem_node_add_slice(basepfn, endpfn);
852 852 }
853 853
854 854 static void
855 855 mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
856 856 {
857 857 #if defined(DEBUG) && !defined(lint)
858 858 static int slice_count = 0;
859 859
860 860 slice_count++;
861 861 MPO_DEBUG("mem_del_slice(%d): basepfn: %lx endpfn: %lx\n",
862 862 slice_count, basepfn, endpfn);
863 863 #endif
864 864 mem_node_del_slice(basepfn, endpfn);
865 865 }
866 866
867 867 /*
868 868 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
869 869 */
870 870 static void
871 871 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
872 872 {
873 873 MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
874 874 "mnode index: %d\n", plathand, mnode);
875 875 plat_assign_lgrphand_to_mem_node(plathand, mnode);
876 876 }
877 877
878 878 /*
879 879 * plat_build_mem_nodes()
880 880 *
881 881 * Define the mem_nodes based on the modified boot memory list,
882 882 * or based on info read from the MD in plat_lgrp_init().
883 883 *
884 884 * When the home mask lies in the middle of the address bits (as it does on
885 885 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
886 886 * it is striped across an mblock in a repeating pattern of contiguous memory
887 887 * followed by a gap. The stripe width is the size of the contiguous piece.
888 888 * The stride is the distance from the start of one contiguous piece to the
889 889 * start of the next. The gap is thus stride - stripe_width.
890 890 *
891 891 * The stripe of an mnode that falls within an mblock is described by the type
892 892 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The
893 893 * mem_stripe_t's are kept in a global array mem_stripes[]. The index into
894 894 * this array is predetermined. The mem_stripe_t that describes mnode m
895 895 * within mpo_mblock[i] is stored at
896 896 * mem_stripes[ m + i * max_locality_groups ]
897 897 *
898 898 * max_locality_groups is the total number of possible locality groups,
899 899 * as defined by the size of the home mask, even if the memory assigned
900 900 * to the domain is small and does not cover all the lgroups. Thus some
901 901 * mem_stripe_t's may be empty.
902 902 *
903 903 * The members of mem_stripe_t are:
904 904 * physbase: First valid page in mem_node in the corresponding mblock
905 905 * physmax: Last valid page in mem_node in mblock
906 906 * offset: The full stripe width starts at physbase - offset.
907 907 * Thus if offset is non-zero, this mem_node starts in the middle
908 908 * of a stripe width, and the second full stripe starts at
909 909 * physbase - offset + stride. (even though physmax may fall in the
910 910 * middle of a stripe width, we do not save the ending fragment size
911 911 * in this data structure.)
912 912 * exists: Set to 1 if the mblock has memory in this mem_node stripe.
913 913 *
914 914 * The stripe width is kept in the global mnode_pages.
915 915 * The stride is kept in the global mnode_stride.
916 916 * All the above use pfn's as the unit.
917 917 *
918 918 * As an example, the memory layout for a domain with 2 mblocks and 4
919 919 * mem_nodes 0,1,2,3 could look like this:
920 920 *
921 921 * 123012301230 ... 012301230123 ...
922 922 * mblock 0 mblock 1
923 923 */
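
The indexing rule above can be stated directly in code. The sketch below is illustrative only (the ex_* names are invented); it simply restates mem_stripes[m + i * max_locality_groups] and the start of the full stripe implied by the offset field.

    /* Stripe of mem_node "mnode" within mpo_mblock[i]. */
    static mem_stripe_t *
    ex_stripe(int i, int mnode)
    {
            return (&mem_stripes[mnode + i * max_locality_groups]);
    }

    /* First pfn of the full stripe width containing ms->physbase. */
    #define EX_STRIPE_START(ms)     ((ms)->physbase - (ms)->offset)
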
924 924
925 925 /*ARGSUSED*/
926 926 void
927 927 plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
928 928 {
929 929 int elem;
930 930 uint64_t base, len;
931 931
932 932 /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
933 933 max_mem_nodes = max_locality_groups;
934 934
935 935 mstripe_update(&mpo_config);
936 936
937 937 /* Check for non-MPO sun4v platforms */
938 938 if (n_locality_groups <= 1) {
939 939 mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
940 940 for (elem = 0; elem < nelems; list++, elem++) {
941 941 base = list->addr;
942 942 len = list->size;
943 943
944 944 mpo_mem_node_add_slice(btop(base),
945 945 btop(base + len - 1));
946 946 }
947 947 mem_node_pfn_shift = 0;
948 948 mem_node_physalign = 0;
949 949 } else
950 950 mnode_update(&mpo_config, 0, 0, U_ADD_ALL);
951 951
952 952 /*
953 953 * Indicate to vm_pagelist that the hpm_counters array
954 954 * should be shared because the ranges overlap.
955 955 */
956 956 if (max_mem_nodes > 1) {
957 957 interleaved_mnodes = 1;
958 958 }
959 959 }
960 960
961 961 /*
962 962 * Return the locality group value for the supplied processor
963 963 */
964 964 lgrp_handle_t
965 965 plat_lgrp_cpu_to_hand(processorid_t id)
966 966 {
967 967 lgrp_handle_t lgrphand;
968 968
969 969 mpo_rd_lock();
970 970 if (n_locality_groups > 1) {
971 971 lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
972 972 } else {
973 973 lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
974 974 }
975 975 mpo_rd_unlock();
976 976
977 977 return (lgrphand);
978 978 }
979 979
980 980 int
981 981 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
982 982 {
983 983 /*
984 984 * Return min remote latency when there are more than two lgroups
985 985 * (root and child) and getting latency between two different lgroups
986 986 * or root is involved.
987 987 */
988 988 if (lgrp_optimizations() && (from != to ||
989 989 from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
990 990 return ((int)higher_latency);
991 991 } else {
992 992 return ((int)lower_latency);
993 993 }
994 994 }
995 995
996 996 int
997 997 plat_pfn_to_mem_node(pfn_t pfn)
998 998 {
999 999 int i, mnode;
1000 1000 pfn_t ra_to_pa_pfn;
1001 1001 struct mblock_md *mb;
1002 1002
1003 1003 if (n_locality_groups <= 1)
1004 1004 return (0);
1005 1005
1006 1006 /*
1007 1007 * The mnode is defined to be 1:1 with the lgroup handle, which
1008 1008 * is taken from the home bits. Find the mblock in which
1009 1009 * the pfn falls to get the ra_to_pa adjustment, and extract
1010 1010 * the home bits.
1011 1011 */
1012 1012 mpo_rd_lock();
1013 1013 mb = &mpo_mblock[0];
1014 1014 for (i = 0; i < n_mblocks; i++) {
1015 1015 if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
1016 1016 ra_to_pa_pfn = btop(mb->ra_to_pa);
1017 1017 mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
1018 1018 home_mask_pfn_shift);
1019 1019 ASSERT(mnode < max_mem_nodes);
1020 1020 mpo_rd_unlock();
1021 1021 return (mnode);
1022 1022 }
1023 1023 mb++;
1024 1024 }
1025 1025
1026 1026 panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
1027 1027 return (pfn);
1028 1028 }
1029 1029
1030 1030 /*
1031 1031 * plat_rapfn_to_papfn
1032 1032 *
1033 1033 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1034 1034 * and home mask bits are correct. The upper bits do not necessarily
1035 1035 * match the actual PA, however.
1036 1036 */
1037 1037 pfn_t
1038 1038 plat_rapfn_to_papfn(pfn_t pfn)
1039 1039 {
1040 1040 int i;
1041 1041 pfn_t ra_to_pa_pfn;
1042 1042 struct mblock_md *mb;
1043 1043
1044 1044 ASSERT(n_mblocks > 0);
1045 1045 if (n_mblocks == 1)
1046 1046 return (pfn + base_ra_to_pa_pfn);
1047 1047
1048 1048 /*
1049 1049 * Find the mblock in which the pfn falls
1050 1050 * in order to get the ra_to_pa adjustment.
1051 1051 */
1052 1052 mpo_rd_lock();
1053 1053 for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
1054 1054 if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
1055 1055 ra_to_pa_pfn = btop(mb->ra_to_pa);
1056 1056 mpo_rd_unlock();
1057 1057 return (pfn + ra_to_pa_pfn);
1058 1058 }
1059 1059 }
1060 1060
1061 1061 panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
1062 1062 return (pfn);
1063 1063 }
1064 1064
1065 1065 /*
1066 1066 * plat_mem_node_iterator_init()
1067 1067 * Initialize cookie "it" to iterate over pfn's in an mnode. There is
1068 1068 * no additional iterator function. The caller uses the info from
1069 1069 * the iterator structure directly.
1070 1070 *
1071 1071 * pfn: starting pfn.
1072 1072 * mnode: desired mnode.
1073 1073 * szc: desired page size.
1074 1074 * init:
1075 1075 * if 1, start a new traversal, initialize "it", find first
1076 1076 * mblock containing pfn, and return its starting pfn
1077 1077 * within the mnode.
1078 1078 * if 0, continue the previous traversal using passed-in data
1079 1079 * from "it", advance to the next mblock, and return its
1080 1080 * starting pfn within the mnode.
1081 1081 * it: returns readonly data to the caller; see below.
1082 1082 *
1083 1083 * The input pfn must be aligned for the page size szc.
1084 1084 *
1085 1085 * Returns: starting pfn for the iteration for the mnode/mblock,
1086 1086 * which is aligned according to the page size,
1087 1087 * or returns (pfn_t)(-1) if the input pfn lies past the last
1088 1088 * valid pfn of the mnode.
1089 1089 * Returns misc values in the "it" struct that allows the caller
1090 1090 * to advance the pfn within an mblock using address arithmetic;
1091 1091 * see definition of mem_node_iterator_t in vm_dep.h.
1092 1092 * When the caller calculates a pfn that is greater than the
1093 1093 * returned value it->mi_mblock_end, the caller should again
1094 1094 * call plat_mem_node_iterator_init, passing init=0.
1095 1095 *
1096 1096 * The last mblock in continuation case may be invalid because
1097 1097 * of memory DR. To detect this situation mi_genid is checked
1098 1098 * against mpo_genid which is incremented after a memory DR
1099 1099 * operation. See also plat_slice_add()/plat_slice_del().
1100 1100 */
1101 1101 pfn_t
1102 1102 plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
1103 1103 mem_node_iterator_t *it, int init)
1104 1104 {
1105 1105 int i;
1106 1106 pgcnt_t szcpgcnt = PNUM_SIZE(szc);
1107 1107 struct mblock_md *mblock;
1108 1108 pfn_t base, end;
1109 1109 mem_stripe_t *ms;
1110 1110 uint64_t szcpagesize;
1111 1111
1112 1112 ASSERT(it != NULL);
1113 1113 ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1114 1114 ASSERT(n_mblocks > 0);
1115 1115 ASSERT(P2PHASE(pfn, szcpgcnt) == 0);
1116 1116
1117 1117 mpo_rd_lock();
1118 1118
1119 1119 if (init || (it->mi_genid != mpo_genid)) {
1120 1120 it->mi_genid = mpo_genid;
1121 1121 it->mi_last_mblock = 0;
1122 1122 it->mi_init = 1;
1123 1123 }
1124 1124
1125 1125 /* Check if mpo is not enabled and we only have one mblock */
1126 1126 if (n_locality_groups == 1 && n_mblocks == 1) {
1127 1127 if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
1128 1128 pfn = (pfn_t)-1;
1129 1129 goto done;
1130 1130 }
1131 1131 it->mi_mnode = mnode;
1132 1132 it->mi_ra_to_pa = base_ra_to_pa_pfn;
1133 1133 it->mi_mnode_pfn_mask = 0;
1134 1134 it->mi_mnode_pfn_shift = 0;
1135 1135 it->mi_mnode_mask = 0;
1136 1136 it->mi_mblock_base = mem_node_config[mnode].physbase;
1137 1137 it->mi_mblock_end = mem_node_config[mnode].physmax;
1138 1138 if (pfn < it->mi_mblock_base)
1139 1139 pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
1140 1140 if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
1141 1141 pfn = (pfn_t)-1;
1142 1142 goto done;
1143 1143 }
1144 1144
1145 1145 /* init=1 means begin iterator, init=0 means continue */
1146 1146 if (init == 1) {
1147 1147 i = 0;
1148 1148 } else {
1149 1149 ASSERT(it->mi_last_mblock < n_mblocks);
1150 1150 i = it->mi_last_mblock;
1151 1151 ASSERT(pfn >
1152 1152 mem_stripes[i * max_locality_groups + mnode].physmax);
1153 1153 if (++i == n_mblocks) {
1154 1154 pfn = (pfn_t)-1;
1155 1155 goto done;
1156 1156 }
1157 1157 }
1158 1158
1159 1159 /*
1160 1160 * Find the mblock that contains pfn for mnode's stripe, or the first such
1161 1161 * mblock after pfn; otherwise pfn is out of bounds and we'll return -1.
1162 1162 * mblocks and stripes are sorted in ascending address order.
1163 1163 */
1164 1164 szcpagesize = szcpgcnt << PAGESHIFT;
1165 1165 for (; i < n_mblocks; i++) {
1166 1166 if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
1167 1167 continue;
1168 1168 ms = &mem_stripes[i * max_locality_groups + mnode];
1169 1169 if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
1170 1170 (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
1171 1171 ms->physmax)
1172 1172 break;
1173 1173 }
1174 1174 if (i == n_mblocks) {
1175 1175 it->mi_last_mblock = i - 1;
1176 1176 pfn = (pfn_t)-1;
1177 1177 goto done;
1178 1178 }
1179 1179
1180 1180 it->mi_last_mblock = i;
1181 1181
1182 1182 mblock = &mpo_mblock[i];
1183 1183 base = ms->physbase;
1184 1184 end = ms->physmax;
1185 1185
1186 1186 it->mi_mnode = mnode;
1187 1187 it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1188 1188 it->mi_mblock_base = base;
1189 1189 it->mi_mblock_end = end;
1190 1190 it->mi_mnode_pfn_mask = home_mask_pfn; /* is 0 for non-MPO case */
1191 1191 it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1192 1192 it->mi_mnode_mask = max_locality_groups - 1;
1193 1193 if (pfn < base) {
1194 1194 pfn = P2ROUNDUP(base, szcpgcnt);
1195 1195 ASSERT(pfn + szcpgcnt - 1 <= end);
1196 1196 }
1197 1197 ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
1198 1198 done:
1199 1199 mpo_rd_unlock();
1200 1200 return (pfn);
1201 1201 }
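
The calling protocol documented above (start with init=1, advance the pfn by the page count, re-call with init=0 once the pfn passes mi_mblock_end, stop on (pfn_t)-1) might look like the following. This is a hedged usage sketch, not a caller that exists in the tree; ex_walk_mnode is an invented name and error handling is omitted.

    static void
    ex_walk_mnode(int mnode, uchar_t szc)
    {
            mem_node_iterator_t it;
            pgcnt_t szcpgcnt = PNUM_SIZE(szc);
            pfn_t pfn;

            pfn = plat_mem_node_iterator_init(0, mnode, szc, &it, 1);
            while (pfn != (pfn_t)-1) {
                    /* ... use the szc-sized page starting at pfn ... */
                    pfn += szcpgcnt;
                    if (pfn > it.mi_mblock_end)
                            pfn = plat_mem_node_iterator_init(pfn, mnode,
                                szc, &it, 0);
            }
    }
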
1202 1202
1203 1203 /*
1204 1204 * plat_mem_node_intersect_range()
1205 1205 *
1206 1206 * Find the intersection between a memnode and a range of pfn's.
1207 1207 */
1208 1208 void
1209 1209 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1210 1210 int mnode, pgcnt_t *npages_out)
1211 1211 {
1212 1212 pfn_t offset, len, hole, base, end, test_end, frag;
1213 1213 pfn_t nearest;
1214 1214 mem_stripe_t *ms;
1215 1215 int i, npages;
1216 1216
1217 1217 *npages_out = 0;
1218 1218
1219 1219 if (!mem_node_config[mnode].exists || test_len == 0)
1220 1220 return;
1221 1221
1222 1222 base = mem_node_config[mnode].physbase;
1223 1223 end = mem_node_config[mnode].physmax;
1224 1224
1225 1225 test_end = test_base + test_len - 1;
1226 1226 if (end < test_base || base > test_end)
1227 1227 return;
1228 1228
1229 1229 if (n_locality_groups == 1) {
1230 1230 *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1231 1231 return;
1232 1232 }
1233 1233
1234 1234 hole = mnode_stride - mnode_pages;
1235 1235 npages = 0;
1236 1236
1237 1237 /*
1238 1238 * Iterate over all the stripes for this mnode (one per mblock),
1239 1239 * find the intersection with each, and accumulate the intersections.
1240 1240 *
1241 1241 * Determining the intersection with a stripe is tricky. If base or end
1242 1242 * fall outside the mem_node bounds, round them to physbase/physmax of
1243 1243 * mem_node. If base or end fall in a gap, round them to start of
1244 1244 * nearest stripe. If they fall within a stripe, keep base or end,
1245 1245 * but calculate the fragment size that should be excluded from the
1246 1246 * stripe. Calculate how many strides fall in the adjusted range,
1247 1247 * multiply by stripe width, and add the start and end fragments.
1248 1248 */
1249 1249
1250 1250 mpo_rd_lock();
1251 1251 for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1252 1252 ms = &mem_stripes[i];
1253 1253 if (ms->exists &&
1254 1254 test_base <= (end = ms->physmax) &&
1255 1255 test_end >= (base = ms->physbase)) {
1256 1256
1257 1257 offset = ms->offset;
1258 1258
1259 1259 if (test_base > base) {
1260 1260 /* Round test_base to next multiple of stride */
1261 1261 len = P2ROUNDUP(test_base - (base - offset),
1262 1262 mnode_stride);
1263 1263 nearest = base - offset + len;
1264 1264 /*
1265 1265 * Compute distance from test_base to the
1266 1266 * stride boundary to see if test_base falls
1267 1267 * in the stripe or in the hole.
1268 1268 */
1269 1269 if (nearest - test_base > hole) {
1270 1270 /*
1271 1271 * test_base lies in stripe,
1272 1272 * and offset should be excluded.
1273 1273 */
1274 1274 offset = test_base -
1275 1275 (nearest - mnode_stride);
1276 1276 base = test_base;
1277 1277 } else {
1278 1278 /* round up to next stripe start */
1279 1279 offset = 0;
1280 1280 base = nearest;
1281 1281 if (base > end)
1282 1282 continue;
1283 1283 }
1284 1284
1285 1285 }
1286 1286
1287 1287 if (test_end < end)
1288 1288 end = test_end;
1289 1289 end++; /* adjust to an exclusive bound */
1290 1290
1291 1291 /* Round end to next multiple of stride */
1292 1292 len = P2ROUNDUP(end - (base - offset), mnode_stride);
1293 1293 nearest = (base - offset) + len;
1294 1294 if (nearest - end <= hole) {
1295 1295 /* end falls in hole, use entire last stripe */
1296 1296 frag = 0;
1297 1297 } else {
1298 1298 /* end falls in stripe, compute fragment */
1299 1299 frag = nearest - hole - end;
1300 1300 }
1301 1301
1302 1302 len = (len >> stripe_shift) - offset - frag;
1303 1303 npages += len;
1304 1304 }
1305 1305 }
1306 1306
1307 1307 *npages_out = npages;
1308 1308 mpo_rd_unlock();
1309 1309 }
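
A small worked example of the stripe geometry used above, with assumed numbers that do not come from the file: with max_locality_groups = 4 and a stripe width of mnode_pages = 0x1000 pfns, the stride is mnode_stride = 4 * 0x1000 = 0x4000 pfns, and the hole between consecutive stripes of one mnode is mnode_stride - mnode_pages = 0x3000 pfns. A test range that exactly covers two strides of mnode 0 therefore intersects it in 2 * 0x1000 = 0x2000 pages.
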
1310 1310
1311 1311 /*
1312 1312 * valid_pages()
1313 1313 *
1314 1314 * Return 1 if pages are valid and do not cross mnode boundaries
1315 1315 * (which would break page free list assumptions), and 0 otherwise.
1316 1316 */
1317 1317
1318 1318 #define MNODE(pa) \
1319 1319 ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1320 1320
1321 1321 static int
1322 1322 valid_pages(md_t *md, mde_cookie_t cpu0)
1323 1323 {
1324 1324 int i, max_szc;
1325 1325 uint64_t last_page_base, szc_mask;
1326 1326 uint64_t max_page_len, max_coalesce_len;
1327 1327 struct mblock_md *mb = mpo_mblock;
1328 1328
1329 1329 /*
1330 1330 * Find the smaller of the largest page possible and supported.
1331 1331 * mmu_exported_pagesize_mask is not yet initialized, so read
1332 1332 * it from the MD. Apply minimal fixups in case of broken MDs
1333 1333 * to get a sane mask.
1334 1334 */
1335 1335
1336 1336 if (cpu0 == NULL)
1337 1337 szc_mask = szc_mask0;
1338 1338 else {
1339 1339 if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1340 1340 szc_mask = 0;
1341 1341 /* largest in sun4v default support */
1342 1342 szc_mask |= (1 << TTE4M);
1343 1343 szc_mask0 = szc_mask;
1344 1344 }
1345 1345 max_szc = highbit(szc_mask) - 1;
1346 1346 if (max_szc > TTE256M)
1347 1347 max_szc = TTE256M;
1348 1348 max_page_len = TTEBYTES(max_szc);
1349 1349
1350 1350 /*
1351 1351 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1352 1352 * if mmu-page-size-list does not contain it, so 256M pages must fall
1353 1353 * within one mnode to use MPO.
1354 1354 */
1355 1355 max_coalesce_len = TTEBYTES(TTE256M);
1356 1356 ASSERT(max_coalesce_len >= max_page_len);
1357 1357
1358 1358 if (ptob(mnode_pages) < max_coalesce_len) {
1359 1359 MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1360 1360 "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1361 1361 return (0);
1362 1362 }
1363 1363
1364 1364 for (i = 0; i < n_mblocks; i++) {
1365 1365 uint64_t base = mb->base;
1366 1366 uint64_t end = mb->base + mb->size - 1;
1367 1367 uint64_t ra_to_pa = mb->ra_to_pa;
1368 1368
1369 1369 /*
1370 1370 * If mblock is smaller than the max page size, then
1371 1371 * RA = PA mod MAXPAGE is not guaranteed, but it must
1372 1372 * not span mnodes.
1373 1373 */
1374 1374 if (mb->size < max_page_len) {
1375 1375 if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1376 1376 MPO_STATUS("Small mblock spans mnodes; "
1377 1377 "MPO disabled: base = %lx, end = %lx, "
1378 1378 "ra2pa = %lx\n", base, end, ra_to_pa);
1379 1379 return (0);
1380 1380 }
1381 1381 } else {
1382 1382 /* Verify RA = PA mod MAXPAGE, using coalesce size */
1383 1383 uint64_t pa_base = base + ra_to_pa;
1384 1384 if ((base & (max_coalesce_len - 1)) !=
1385 1385 (pa_base & (max_coalesce_len - 1))) {
1386 1386 MPO_STATUS("bad page alignment; MPO disabled: "
1387 1387 "ra = %lx, pa = %lx, pagelen = %lx\n",
1388 1388 base, pa_base, max_coalesce_len);
1389 1389 return (0);
1390 1390 }
1391 1391 }
1392 1392
1393 1393 /*
1394 1394 * Find start of last large page in mblock in RA space.
1395 1395 * If page extends into the next mblock, verify the
1396 1396 * mnode does not change.
1397 1397 */
1398 1398 last_page_base = P2ALIGN(end, max_coalesce_len);
1399 1399 if (i + 1 < n_mblocks &&
1400 1400 last_page_base + max_coalesce_len > mb[1].base &&
1401 1401 MNODE(last_page_base + ra_to_pa) !=
1402 1402 MNODE(mb[1].base + mb[1].ra_to_pa)) {
1403 1403 MPO_STATUS("Large page spans mblocks; MPO disabled: "
1404 1404 "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1405 1405 "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1406 1406 mb[1].ra_to_pa, max_coalesce_len);
1407 1407 return (0);
1408 1408 }
1409 1409
1410 1410 mb++;
1411 1411 }
1412 1412 return (1);
1413 1413 }
1414 1414
1415 1415
1416 1416 /*
1417 1417 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1418 1418 * if any, and remove them. This yields a config where the "coarse
1419 1419 * grained" lgroups cover all of memory, even though part of that memory
1420 1420 * is fine grain interleaved and does not deliver a purely local memory
1421 1421 * latency.
1422 1422 *
1423 1423 * This function reads and modifies the globals:
1424 1424 * mpo_lgroup[], n_lgrpnodes
1425 1425 *
1426 1426 * Returns 1 if lgroup nodes were removed, 0 otherwise.
1427 1427 */
1428 1428
1429 1429 static int
1430 1430 fix_interleave(void)
1431 1431 {
1432 1432 int i, j;
1433 1433 uint64_t mask = 0;
1434 1434
1435 1435 j = 0;
1436 1436 for (i = 0; i < n_lgrpnodes; i++) {
1437 1437 if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1438 1438 /* remove this lgroup */
1439 1439 mask = mpo_lgroup[i].addr_mask;
1440 1440 } else {
1441 1441 mpo_lgroup[j++] = mpo_lgroup[i];
1442 1442 }
1443 1443 }
1444 1444 n_lgrpnodes = j;
1445 1445
1446 1446 if (mask != 0)
1447 1447 MPO_STATUS("sub-page interleave %lx found; "
1448 1448 "removing lgroup.\n", mask);
1449 1449
1450 1450 return (mask != 0);
1451 1451 }
1452 1452
1453 1453 /*
1454 1454 * mblock_alloc
1455 1455 *
1456 1456 * Allocate memory for the mblock and stripe arrays from either static or
1457 1457 * dynamic space depending on utype, and return the result in mc.
1458 1458 * Returns 0 on success and -1 on error.
1459 1459 */
1460 1460
1461 1461 static int
1462 1462 mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
1463 1463 {
1464 1464 mblock_md_t *mb = NULL;
1465 1465 mem_stripe_t *ms = NULL;
1466 1466 int nstripes = MAX_MEM_NODES * nmblocks;
1467 1467 size_t mblocksz = nmblocks * sizeof (struct mblock_md);
1468 1468 size_t mstripesz = nstripes * sizeof (mem_stripe_t);
1469 1469 size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
1470 1470
1471 1471 /*
1472 1472 * Allocate space for mblocks and mstripes.
1473 1473 *
1474 1474 * For DR allocations, just use kmem_alloc(), and set
1475 1475 * mc_alloc_sz to indicate it was used.
1476 1476 *
1477 1477 * For boot allocation:
1478 1478 * If we have a small number of mblocks we will use the space
1479 1479 * that we preallocated. Otherwise, we will dynamically
1480 1480 * allocate the space from the prom and map it to the
1481 1481 * reserved VA at MPOBUF_BASE.
1482 1482 */
1483 1483
1484 1484 if (utype == U_ADD || utype == U_DEL) {
1485 1485 mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
1486 1486 ms = (mem_stripe_t *)(mb + nmblocks);
1487 1487 mc->mc_alloc_sz = allocsz;
1488 1488 } else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
1489 1489 mb = &small_mpo_mblocks[0];
1490 1490 ms = &small_mem_stripes[0];
1491 1491 mc->mc_alloc_sz = 0;
1492 1492 } else {
1493 1493 /* Ensure that we don't request more space than reserved */
1494 1494 if (allocsz > MPOBUF_SIZE) {
1495 1495 MPO_STATUS("mblock_alloc: Insufficient space "
1496 1496 "for mblock structures \n");
1497 1497 return (-1);
1498 1498 }
1499 1499 mb = (struct mblock_md *)
1500 1500 prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
1501 1501 if (mb != (struct mblock_md *)MPOBUF_BASE) {
1502 1502 MPO_STATUS("mblock_alloc: Cannot allocate space "
1503 1503 "for mblocks \n");
1504 1504 return (-1);
1505 1505 }
1506 1506 mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
1507 1507 mpo_heap32_bufsz = MPOBUF_SIZE;
1508 1508 ms = (mem_stripe_t *)(mb + nmblocks);
1509 1509 mc->mc_alloc_sz = 0;
1510 1510 }
1511 1511 mc->mc_mblocks = mb;
1512 1512 mc->mc_stripes = ms;
1513 1513 mc->mc_nmblocks = nmblocks;
1514 1514 mc->mc_nstripes = nstripes;
1515 1515 MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
1516 1516 return (0);
1517 1517 }
1518 1518
1519 1519 /*
1520 1520 * mblock_free
1521 1521 *
1522 1522 * Free memory in mc that was allocated by mblock_alloc.
1523 1523 */
1524 1524
1525 1525 static void
1526 1526 mblock_free(mpo_config_t *mc)
1527 1527 {
1528 1528 if (mc->mc_alloc_sz > 0) {
1529 1529 ASSERT(mc->mc_mblocks != mpo_mblock);
1530 1530 kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
1531 1531 }
1532 1532 bzero(mc, sizeof (*mc));
1533 1533 }
1534 1534
1535 1535 /*
1536 1536 * mblock_install
1537 1537 *
1538 1538 * Install mblock config passed in mc as the global configuration.
1539 1539 * May only be called at boot or while holding mpo_wr_lock.
1540 1540 */
1541 1541
1542 1542 static void
1543 1543 mblock_install(mpo_config_t *mc)
1544 1544 {
1545 1545 mpo_mblock = mc->mc_mblocks;
1546 1546 n_mblocks = mc->mc_nmblocks;
1547 1547 mem_stripes = mc->mc_stripes;
1548 1548 n_mem_stripes = mc->mc_nstripes;
1549 1549 base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
1550 1550 mpo_config = *mc;
1551 1551 }
1552 1552
1553 1553 /*
1554 1554 * mblock_update
1555 1555 *
1556 1556 * Traverse mblocknodes, read the mblock properties from the MD, and
1557 1557 * save the mblocks in mc.
1558 1558 */
1559 1559
1560 1560 static void
1561 1561 mblock_update(mpo_config_t *mc, md_t md, mde_cookie_t *mblocknodes)
1562 1562 {
1563 1563 uint64_t i, j;
1564 1564 int result = 0;
1565 1565 mblock_md_t *mblock = mc->mc_mblocks;
1566 1566
1567 1567 for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {
1568 1568
1569 1569 /* Without a base or size value we will fail */
1570 1570 result = get_int(md, mblocknodes[j], PROP_LG_BASE,
1571 1571 &mblock[i].base);
1572 1572 if (result < 0) {
1573 1573 MPO_STATUS("mblock_update: "
1574 1574 "PROP_LG_BASE is missing\n");
1575 1575 mc->mc_nmblocks = 0;
1576 1576 return;
1577 1577 }
1578 1578
1579 1579 result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
1580 1580 &mblock[i].size);
1581 1581 if (result < 0) {
1582 1582 MPO_STATUS("mblock_update: "
1583 1583 "PROP_LG_SIZE is missing\n");
1584 1584 mc->mc_nmblocks = 0;
1585 1585 return;
1586 1586 }
1587 1587
1588 1588 result = get_int(md, mblocknodes[j],
1589 1589 PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);
1590 1590
1591 1591 /* If we don't have an ra_pa_offset, just set it to 0 */
1592 1592 if (result < 0)
1593 1593 mblock[i].ra_to_pa = 0;
1594 1594
1595 1595 MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
1596 1596 "ra_to_pa = %lx\n", i,
1597 1597 mblock[i].base,
1598 1598 mblock[i].size,
1599 1599 mblock[i].ra_to_pa);
1600 1600
1601 1601 /* check for unsupportable values of base and size */
1602 1602 if (mblock[i].base > mblock[i].base + mblock[i].size) {
1603 1603 MPO_STATUS("mblock_update: "
1604 1604 "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
1605 1605 "base = %lx, size = %lx\n",
1606 1606 mblock[i].base, mblock[i].size);
1607 1607 mc->mc_nmblocks = 0;
1608 1608 return;
1609 1609 }
1610 1610
1611 1611 /* eliminate size==0 blocks */
1612 1612 if (mblock[i].size != 0) {
1613 1613 uint64_t base = mblock[i].base;
1614 1614 uint64_t end = base + mblock[i].size;
1615 1615 ASSERT(end > base);
1616 1616 mblock[i].base_pfn = btop(base);
1617 1617 mblock[i].end_pfn = btop(end - 1);
1618 1618 i++;
1619 1619 }
1620 1620 }
1621 1621
1622 1622 if (i == 0) {
1623 1623 MPO_STATUS("mblock_update: "
1624 1624 "No non-empty mblock nodes were found "
1625 1625 "in the Machine Descriptor\n");
1626 1626 mc->mc_nmblocks = 0;
1627 1627 return;
1628 1628 }
1629 1629 ASSERT(i <= mc->mc_nmblocks);
1630 1630 mc->mc_nmblocks = i;
1631 1631
1632 1632 /* Must sort mblocks by address for mem_node_iterator_init() */
1633 1633 mblock_sort(mblock, mc->mc_nmblocks);
1634 1634 }
1635 1635
1636 1636 /*
1637 1637 * mblock_update_add
1638 1638 *
1639 1639 * Update mblock config after a memory DR add. The added range is not
1640 1640 * needed, as we read *all* mblock nodes from the MD. Save the mblocks
1641 1641 * in mc.
1642 1642 */
1643 1643
1644 1644 static void
1645 1645 mblock_update_add(mpo_config_t *mc)
1646 1646 {
1647 1647 md_t *md;
1648 1648 mde_cookie_t root, *mblocknodes;
1649 1649 int nmblocks = 0;
1650 1650
1651 1651 if ((md = md_get_handle()) == NULL) {
1652 1652 MPO_STATUS("Cannot access Machine Descriptor\n");
1653 1653 goto error;
1654 1654 }
1655 1655
1656 1656 if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
1657 1657 goto error;
1658 1658
1659 1659 nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
1660 1660 &mblocknodes);
1661 1661 if (nmblocks <= 0) {
1662 1662 MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
1663 1663 goto error;
1664 1664 }
1665 1665
1666 1666 if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
1667 1667 goto error;
1668 1668
1669 1669 mblock_update(mc, md, mblocknodes);
1670 1670 md_free_scan_dag(md, &mblocknodes);
1671 1671 (void) md_fini_handle(md);
1672 1672 return;
1673 1673 error:
1674 1674 panic("mblock_update_add: cannot process mblocks from MD.\n");
1675 1675 }
1676 1676
1677 1677 /*
1678 1678 * mblock_update_del
1679 1679 *
1680 1680 * Update mblocks after a memory DR deletion of the range (ubase, uend).
1681 1681 * Allocate a new mblock config, copy old config to the new, modify the new
1682 1682 * mblocks to reflect the deletion. The new mblocks are returned in
1683 1683 * mc_new and are not yet installed as the active config.
1684 1684 */
1685 1685
1686 1686 static void
1687 1687 mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
1688 1688 pfn_t uend)
1689 1689 {
1690 1690 int i, j;
1691 1691 pfn_t base, end;
1692 1692 mblock_md_t *mblock;
1693 1693 int nmblocks = mc_old->mc_nmblocks;
1694 1694
1695 1695 MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);
1696 1696
1697 1697 /*
1698 1698 * Allocate mblocks in mc_new and copy the old to the new.
1699 1699 * Allocate one extra in case the deletion splits an mblock.
1700 1700 */
1701 1701 if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
1702 1702 return;
1703 1703 mblock = mc_new->mc_mblocks;
1704 1704 bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));
1705 1705
1706 1706 /*
1707 1707 * Find the mblock containing the deleted range and adjust it in
1708 1708 * the new config.
1709 1709 */
1710 1710 for (i = 0; i < nmblocks; i++) {
1711 1711
1712 1712 base = btop(mblock[i].base);
1713 1713 end = base + btop(mblock[i].size) - 1;
1714 1714
1715 1715 /*
1716 1716 * Adjust the mblock based on the subset that was deleted.
1717 1717 *
1718 1718 * If the entire mblk was deleted, compact the table.
1719 1719 *
1720 1720 * If the middle of the mblk was deleted, extend
1721 1721 * the table. Space for the new slot was already
1722 1722 * allocated.
1723 1723 *
1724 1724 * The memory to be deleted is an entire mblock or a
1725 1725 * subset of one, and does not span multiple mblocks.
1726 1726 */
1727 1727 if (base == ubase && end == uend) {
1728 1728 for (j = i; j < nmblocks - 1; j++)
1729 1729 mblock[j] = mblock[j + 1];
1730 1730 nmblocks--;
1731 1731 bzero(&mblock[nmblocks], sizeof (*mblock));
1732 1732 break;
1733 1733 } else if (base < ubase && end > uend) {
1734 1734 for (j = nmblocks - 1; j >= i; j--)
1735 1735 mblock[j + 1] = mblock[j];
1736 1736 mblock[i].size = ptob(ubase - base);
1737 1737 mblock[i].end_pfn = ubase - 1;
1738 1738 mblock[i + 1].base = ptob(uend + 1);
1739 1739 mblock[i + 1].size = ptob(end - uend);
1740 1740 mblock[i + 1].base_pfn = uend + 1;
1741 1741 nmblocks++;
1742 1742 break;
1743 1743 } else if (base == ubase) {
1744 1744 MPO_DEBUG("mblock_update_del: shrink>"
1745 1745 " i=%d base=0x%lx end=0x%lx", i, base, end);
1746 1746 mblock[i].base = ptob(uend + 1);
1747 1747 mblock[i].size -= ptob(uend - ubase + 1);
1748 1748 base = uend + 1;
1749 1749 mblock[i].base_pfn = base;
1750 1750 mblock[i].end_pfn = end;
1751 1751 MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1752 1752 break;
1753 1753 } else if (end == uend) {
1754 1754 MPO_DEBUG("mblock_update_del: shrink<"
1755 1755 " i=%d base=0x%lx end=0x%lx", i, base, end);
1756 1756 mblock[i].size -= ptob(uend - ubase + 1);
1757 1757 end = ubase - 1;
1758 1758 mblock[i].base_pfn = base;
1759 1759 mblock[i].end_pfn = end;
1760 1760 MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1761 1761 break;
1762 1762 }
1763 1763 }
1764 1764 mc_new->mc_nmblocks = nmblocks;
1765 1765 ASSERT(end > base);
1766 1766 }
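
/*
 * Illustrative example of the split case above (values assumed for
 * clarity, 8K pages): deleting pfns [0x48000, 0x4ffff] from an mblock
 * with base = 0x80000000, size = 0x40000000 (pfns [0x40000, 0x5ffff])
 * leaves two mblocks in the new config:
 *
 *	mblock[i]:     base = 0x80000000, size = 0x10000000,
 *	               base_pfn = 0x40000, end_pfn = 0x47fff
 *	mblock[i + 1]: base = 0xa0000000, size = 0x20000000,
 *	               base_pfn = 0x50000, end_pfn = 0x5ffff
 *
 * mblock[i + 1].end_pfn is inherited from the copied original entry and
 * remains correct after the split.
 */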
1767 1767
1768 1768 /*
1769 1769 * mstripe_update
1770 1770 *
1771 1771 * Read mblocks from mc and update mstripes in mc
1772 1772 */
1773 1773
1774 1774 static void
1775 1775 mstripe_update(mpo_config_t *mc)
1776 1776 {
1777 1777 lgrp_handle_t lgrphand, lgrp_start;
1778 1778 int i, mnode;
1779 1779 uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
1780 1780 uint64_t stripe, frag, remove;
1781 1781 mem_stripe_t *ms;
1782 1782 mblock_md_t *mblock = mc->mc_mblocks;
1783 1783 int nmblocks = mc->mc_nmblocks;
1784 1784 int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);
1785 1785
1786 1786 /* Check for non-MPO sun4v platforms or memory DR removal */
1787 1787 if (n_locality_groups <= 1) {
1788 1788 ASSERT(n_locality_groups == 1);
1789 1789 ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);
1790 1790
1791 1791 if (nmblocks == 1) {
1792 1792 mc->mc_nstripes = 0;
1793 1793 } else {
1794 1794 mc->mc_nstripes = nmblocks;
1795 1795 bzero(mc->mc_stripes, mstripesz);
1796 1796 for (i = 0; i < nmblocks; i++) {
1797 1797 mc->mc_stripes[i].exists = 1;
1798 1798 mc->mc_stripes[i].physbase = mblock[i].base_pfn;
1799 1799 mc->mc_stripes[i].physmax = mblock[i].end_pfn;
1800 1800 }
1801 1801 }
1802 1802 return;
1803 1803 }
1804 1804
1805 1805 bzero(mc->mc_stripes, mstripesz);
1806 1806 mc->mc_nstripes = max_locality_groups * nmblocks;
1807 1807 stripe = ptob(mnode_pages);
1808 1808 stride = max_locality_groups * stripe;
1809 1809
1810 1810 for (i = 0; i < nmblocks; i++) {
1811 1811 base = mblock[i].base;
1812 1812 end = base + mblock[i].size;
1813 1813 ra_to_pa = mblock[i].ra_to_pa;
1814 1814
1815 1815 /* Find the offset from the prev stripe boundary in PA space. */
1816 1816 offset = (base + ra_to_pa) & (stripe - 1);
1817 1817
1818 1818 /* Set the next stripe boundary. */
1819 1819 stripe_end = base - offset + stripe;
1820 1820
1821 1821 lgrp_start = (((base + ra_to_pa) & home_mask) >>
1822 1822 home_mask_shift);
1823 1823 lgrphand = lgrp_start;
1824 1824
1825 1825 /*
1826 1826 * Loop over all lgroups covered by the mblock, creating a
1827 1827 * stripe for each. Stop when lgrp_start is visited again.
1828 1828 */
1829 1829 do {
1830 1830 /* mblock may not span all lgroups */
1831 1831 if (base >= end)
1832 1832 break;
1833 1833
1834 1834 mnode = lgrphand;
1835 1835 ASSERT(mnode < max_mem_nodes);
1836 1836
1837 1837 /*
1838 1838 * Calculate the size of the fragment that does not
1839 1839 * belong to the mnode in the last partial stride.
1840 1840 */
1841 1841 frag = (end - (base - offset)) & (stride - 1);
1842 1842 if (frag == 0) {
1843 1843 /* remove the gap */
1844 1844 remove = stride - stripe;
1845 1845 } else if (frag < stripe) {
1846 1846 /* fragment fits in stripe; keep it all */
1847 1847 remove = 0;
1848 1848 } else {
1849 1849 /* fragment is large; trim after whole stripe */
1850 1850 remove = frag - stripe;
1851 1851 }
1852 1852
1853 1853 ms = &mc->mc_stripes[i * max_locality_groups + mnode];
1854 1854 ms->physbase = btop(base);
1855 1855 ms->physmax = btop(end - 1 - remove);
1856 1856 ms->offset = btop(offset);
1857 1857 ms->exists = 1;
1858 1858
1859 1859 base = stripe_end;
1860 1860 stripe_end += stripe;
1861 1861 offset = 0;
1862 1862 lgrphand = (((base + ra_to_pa) & home_mask) >>
1863 1863 home_mask_shift);
1864 1864 } while (lgrphand != lgrp_start);
1865 1865 }
1866 1866 }
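
/*
 * Illustrative example of the striping above (values assumed for
 * clarity, 8K pages): with max_locality_groups = 2, stripe = 0x10000000
 * (256MB), stride = 0x20000000, home_mask = 0x10000000,
 * home_mask_shift = 28, and a single mblock with base = 0x80000000,
 * size = 0x30000000, ra_to_pa = 0:
 *
 *	mnode 0 owns [0x80000000, 0x8fffffff] and [0xa0000000, 0xafffffff]
 *	    mem_stripe: physbase = 0x40000, physmax = 0x57fff, offset = 0
 *	mnode 1 owns [0x90000000, 0x9fffffff]
 *	    mem_stripe: physbase = 0x48000, physmax = 0x4ffff, offset = 0
 *
 * Each mem_stripe_t records only the first and last pfn the mnode owns
 * within the mblock; the interleaving inside that range is implied by
 * stripe and stride.
 */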
1867 1867
1868 1868 #define INTERSECT(a, b, c, d) \
1869 1869 if (((a) >= (c) && (a) <= (d)) || \
1870 1870 ((c) >= (a) && (c) <= (b))) { \
1871 1871 (c) = MAX((a), (c)); \
1872 1872 (d) = MIN((b), (d)); \
1873 1873 } else { \
1874 1874 ASSERT((a) >= (d) || (b) <= (c)); \
1875 1875 continue; \
1876 1876 } \
1877 1877
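/*
 * INTERSECT(a, b, c, d) clamps the range [c, d] to its overlap with
 * [a, b]; if the two ranges are disjoint it executes "continue", so the
 * macro may only be used inside a loop, as in mnode_update() below.
 */
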
1878 1878 /*
1879 1879 * mnode_update
1880 1880 *
1881 1881 * Read stripes from mc and update mnode extents. The mnode extents are
1882 1882 * part of the live configuration, so this can only be done at boot time
1883 1883 * or while holding the mpo_wr_lock.
1884 1884 */
1885 1885
1886 1886 static void
1887 1887 mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
1888 1888 {
1889 1889 int i, j, mnode, found;
1890 1890 pfn_t base, end;
1891 1891 mem_stripe_t *ms;
1892 1892
1893 1893 MPO_DEBUG("mnode_update: basepfn: %lx endpfn: %lx\n", ubase, uend);
1894 1894
1895 1895 if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
1896 1896 if (utype == U_ADD)
1897 1897 mpo_mem_node_add_slice(ubase, uend);
1898 1898 else if (utype == U_DEL)
1899 1899 mpo_mem_node_del_slice(ubase, uend);
1900 1900 else
1901 1901 panic("mnode update: %d: invalid\n", utype);
1902 1902 return;
1903 1903 }
1904 1904
1905 1905 found = 0;
1906 1906 for (i = 0; i < mc->mc_nmblocks; i++) {
1907 1907 for (mnode = 0; mnode < max_locality_groups; mnode++) {
1908 1908
1909 1909 j = i * max_locality_groups + mnode;
1910 1910 ms = &mc->mc_stripes[j];
1911 1911 if (!ms->exists)
1912 1912 continue;
1913 1913
1914 1914 base = ms->physbase;
1915 1915 end = ms->physmax;
1916 1916
1917 1917 /*
1918 1918 * Look for the mstripes intersecting this slice.
1919 1919 *
1920 1920 * The mstripe and slice pairs may not be equal
1921 1921 * if a subset of an mblock is added or deleted.
1922 1922 */
1923 1923 switch (utype) {
1924 1924 case U_ADD:
1925 1925 INTERSECT(ubase, uend, base, end);
1926 1926 /*FALLTHROUGH*/
1927 1927 case U_ADD_ALL:
1928 1928 if (n_locality_groups > 1)
1929 1929 mpo_plat_assign_lgrphand_to_mem_node(
1930 1930 mnode, mnode);
1931 1931 mpo_mem_node_add_slice(base, end);
1932 1932 break;
1933 1933 case U_DEL:
1934 1934 INTERSECT(ubase, uend, base, end);
1935 1935 mpo_mem_node_del_slice(base, end);
1936 1936 break;
1937 1937 default:
1938 1938 panic("mnode_update: %d: invalid\n", utype);
1939 1939 break;
1940 1940 }
1941 1941
1942 1942 found++;
1943 1943 }
1944 1944 }
1945 1945
1946 1946 if (!found)
1947 1947 panic("mnode_update: mstripe not found");
1948 1948
1949 1949 #ifdef DEBUG
1950 1950 if (utype == U_ADD_ALL || utype == U_DEL)
1951 1951 return;
1952 1952 found = 0;
1953 1953 for (i = 0; i < max_mem_nodes; i++) {
1954 1954 if (!mem_node_config[i].exists)
1955 1955 continue;
1956 1956 if (ubase >= mem_node_config[i].physbase &&
1957 1957 ubase <= mem_node_config[i].physmax)
1958 1958 found |= 1;
1959 1959 if (uend >= mem_node_config[i].physbase &&
1960 1960 uend <= mem_node_config[i].physmax)
1961 1961 found |= 2;
1962 1962 }
1963 1963 ASSERT(found == 3);
1964 1964 {
1965 1965 pfn_t minpfn, maxpfn;
1966 1966
1967 1967 mem_node_max_range(&minpfn, &maxpfn);
1968 1968 ASSERT(minpfn <= ubase);
1969 1969 ASSERT(maxpfn >= uend);
1970 1970 }
1971 1971 #endif
1972 1972 }
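
/*
 * Illustrative example (values assumed for clarity, matching the
 * striping example above): a DR add of pfns [0x44000, 0x47fff] falls
 * entirely inside the mnode-0 stripe [0x40000, 0x57fff].  INTERSECT
 * clamps that stripe to [0x44000, 0x47fff] and mpo_mem_node_add_slice()
 * is called with the clamped range for mnode 0; the mnode-1 stripe
 * [0x48000, 0x4ffff] does not overlap the added range, so its loop
 * iteration is skipped by the "continue" in INTERSECT.
 */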
1973 1973
1974 1974 /*
1975 1975 * Plat_slice_add()/plat_slice_del() are the platform hooks
1976 1976 * for adding/deleting a pfn range to/from the system.
1977 1977 *
1978 1978 * plat_slice_add() is used for both the boot and DR cases.
1979 1979 *
1980 1980 * - Zeus has already added the mblocks to the MD, so read the updated
1981 1981 * MD and allocate all data structures required to manage the new memory
1982 1982 * configuration.
1983 1983 *
1984 1984 * - Recompute the stripes which are derived from the mblocks.
1985 1985 *
1986 1986 * - Update (expand) the mnode extents and install the modified mblocks as
1987 1987 * the new mpo config. This must be done while holding the mpo_wr_lock
1988 1988 * to guarantee that no other threads access the mpo meta-data.
1989 1989 *
1990 1990 * - Unlock MPO data structures; the new config is live. Free the old config.
1991 1991 *
1992 1992 * Plat_slice_del() is used for DR only.
1993 1993 *
1994 1994 * - Zeus has not yet modified the MD to reflect the deletion, so copy
1995 1995 * the old mpo mblocks and delete the range from the copy.
1996 1996 *
1997 1997 * - Recompute the stripes which are derived from the mblocks.
1998 1998 *
1999 1999 * - Update (shrink) the mnode extents and install the modified mblocks as
2000 2000 * the new mpo config. This must be done while holding the mpo_wr_lock
2001 2001 * to guarantee that no other threads access the mpo meta-data.
2002 2002 *
2003 2003 * - Unlock MPO data structures; the new config is live. Free the old config.
2004 2004 */
2005 2005
2006 2006 void
2007 2007 plat_slice_add(pfn_t base, pfn_t end)
2008 2008 {
2009 2009 mpo_config_t old_config = mpo_config;
2010 2010 mpo_config_t new_config;
2011 2011
2012 2012 VALIDATE_SLICE(base, end);
2013 2013 mblock_update_add(&new_config);
2014 2014 mstripe_update(&new_config);
2015 2015 mpo_wr_lock();
2016 2016 mblock_install(&new_config);
2017 2017 /* Use new config to add all ranges for mnode_update */
2018 2018 mnode_update(&new_config, base, end, U_ADD);
2019 2019 mpo_genid++;
2020 2020 mpo_wr_unlock();
2021 2021 mblock_free(&old_config);
2022 2022 }
2023 2023
2024 2024 void
2025 2025 plat_slice_del(pfn_t base, pfn_t end)
2026 2026 {
2027 2027 mpo_config_t old_config = mpo_config;
2028 2028 mpo_config_t new_config;
2029 2029
2030 2030 VALIDATE_SLICE(base, end);
2031 2031 mblock_update_del(&new_config, &old_config, base, end);
2032 2032 mstripe_update(&new_config);
2033 2033 mpo_wr_lock();
2034 2034 /* Use old config to find deleted range for mnode_update */
2035 2035 mnode_update(&old_config, base, end, U_DEL);
2036 2036 mblock_install(&new_config);
2037 2037 mpo_genid++;
2038 2038 mpo_wr_unlock();
2039 2039 mblock_free(&old_config);
2040 2040 }