6146 seg_inherit_notsup is redundant
--- old/usr/src/uts/common/vm/vm_seg.c
+++ new/usr/src/uts/common/vm/vm_seg.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright (c) 2015, Joyent, Inc.
25 25 * Copyright 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
26 26 */
27 27
28 28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 29 /* All Rights Reserved */
30 30
31 31 /*
32 32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 33 * The Regents of the University of California
34 34 * All Rights Reserved
35 35 *
36 36 * University Acknowledgment- Portions of this document are derived from
37 37 * software developed by the University of California, Berkeley, and its
38 38 * contributors.
39 39 */
40 40
41 41 /*
42 42 * VM - segment management.
43 43 */
44 44
45 45 #include <sys/types.h>
46 46 #include <sys/inttypes.h>
47 47 #include <sys/t_lock.h>
48 48 #include <sys/param.h>
49 49 #include <sys/systm.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/sysmacros.h>
52 52 #include <sys/vmsystm.h>
53 53 #include <sys/tuneable.h>
54 54 #include <sys/debug.h>
55 55 #include <sys/fs/swapnode.h>
56 56 #include <sys/cmn_err.h>
57 57 #include <sys/callb.h>
58 58 #include <sys/mem_config.h>
59 59 #include <sys/mman.h>
60 60
61 61 #include <vm/hat.h>
62 62 #include <vm/as.h>
63 63 #include <vm/seg.h>
64 64 #include <vm/seg_kmem.h>
65 65 #include <vm/seg_spt.h>
66 66 #include <vm/seg_vn.h>
67 67 #include <vm/anon.h>
68 68
69 69 /*
70 70 * kstats for segment advise
71 71 */
72 72 segadvstat_t segadvstat = {
73 73 { "MADV_FREE_hit", KSTAT_DATA_ULONG },
74 74 { "MADV_FREE_miss", KSTAT_DATA_ULONG },
75 75 };
76 76
77 77 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
78 78 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
79 79
80 80 /*
81 81 * entry in the segment page cache
82 82 */
83 83 struct seg_pcache {
84 84 struct seg_pcache *p_hnext; /* list for hashed blocks */
85 85 struct seg_pcache *p_hprev;
86 86 pcache_link_t p_plink; /* per segment/amp list */
87 87 void *p_htag0; /* segment/amp pointer */
88 88 caddr_t p_addr; /* base address/anon_idx */
89 89 size_t p_len; /* total bytes */
90 90 size_t p_wlen; /* writable bytes at p_addr */
91 91 struct page **p_pp; /* pp shadow list */
92 92 seg_preclaim_cbfunc_t p_callback; /* reclaim callback function */
93 93 clock_t p_lbolt; /* lbolt from last use */
94 94 struct seg_phash *p_hashp; /* our pcache hash bucket */
95 95 uint_t p_active; /* active count */
96 96 uchar_t p_write; /* true if S_WRITE */
97 97 uchar_t p_ref; /* reference byte */
98 98 ushort_t p_flags; /* bit flags */
99 99 };
100 100
101 101 struct seg_phash {
102 102 struct seg_pcache *p_hnext; /* list for hashed blocks */
103 103 struct seg_pcache *p_hprev;
104 104 kmutex_t p_hmutex; /* protects hash bucket */
105 105 pcache_link_t p_halink[2]; /* active bucket linkages */
106 106 };
107 107
108 108 struct seg_phash_wired {
109 109 struct seg_pcache *p_hnext; /* list for hashed blocks */
110 110 struct seg_pcache *p_hprev;
111 111 kmutex_t p_hmutex; /* protects hash bucket */
112 112 };
113 113
114 114 /*
115 115 * A parameter to control a maximum number of bytes that can be
116 116 * purged from pcache at a time.
117 117 */
118 118 #define P_MAX_APURGE_BYTES (1024 * 1024 * 1024)
119 119
120 120 /*
121 121 * log2(fraction of pcache to reclaim at a time).
122 122 */
123 123 #define P_SHRINK_SHFT (5)
124 124
125 125 /*
126 126 * The following variables can be tuned via /etc/system.
127 127 */
128 128
129 129 int segpcache_enabled = 1; /* if 1, shadow lists are cached */
130 130 pgcnt_t segpcache_maxwindow = 0; /* max # of pages that can be cached */
131 131 ulong_t segpcache_hashsize_win = 0; /* # of non wired buckets */
132 132 ulong_t segpcache_hashsize_wired = 0; /* # of wired buckets */
133 133 int segpcache_reap_sec = 1; /* reap check rate in secs */
134 134 clock_t segpcache_reap_ticks = 0; /* reap interval in ticks */
135 135 int segpcache_pcp_maxage_sec = 1; /* pcp max age in secs */
136 136 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
137 137 int segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
138 138 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
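
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): the
 * tunables above are ordinary kernel variables read by seg_pinit()
 * before the cache is set up, so they can be overridden from
 * /etc/system, for example:
 *
 *        set segpcache_enabled=0
 *        set segpcache_pcp_maxage_sec=5
 *        set segpcache_maxapurge_bytes=0x20000000
 *
 * The values shown are hypothetical; they only demonstrate the syntax.
 */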
139 139
140 140 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
141 141 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
142 142 static kcondvar_t seg_pasync_cv;
143 143
144 144 #pragma align 64(pctrl1)
145 145 #pragma align 64(pctrl2)
146 146 #pragma align 64(pctrl3)
147 147
148 148 /*
149 149 * Keep frequently used variables together in one cache line.
150 150 */
151 151 static struct p_ctrl1 {
152 152 uint_t p_disabled; /* if not 0, caching temporarily off */
153 153 pgcnt_t p_maxwin; /* max # of pages that can be cached */
154 154 size_t p_hashwin_sz; /* # of non wired buckets */
155 155 struct seg_phash *p_htabwin; /* hash table for non wired entries */
156 156 size_t p_hashwired_sz; /* # of wired buckets */
157 157 struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
158 158 kmem_cache_t *p_kmcache; /* kmem cache for seg_pcache structs */
159 159 #ifdef _LP64
160 160 ulong_t pad[1];
161 161 #endif /* _LP64 */
162 162 } pctrl1;
163 163
164 164 static struct p_ctrl2 {
165 165 kmutex_t p_mem_mtx; /* protects window counter and p_halinks */
166 166 pgcnt_t p_locked_win; /* # pages from window */
167 167 pgcnt_t p_locked; /* # of pages cached by pagelock */
168 168 uchar_t p_ahcur; /* current active links for insert/delete */
169 169 uchar_t p_athr_on; /* async reclaim thread is running. */
170 170 pcache_link_t p_ahhead[2]; /* active buckets linkages */
171 171 } pctrl2;
172 172
173 173 static struct p_ctrl3 {
174 174 clock_t p_pcp_maxage; /* max pcp age in ticks */
175 175 ulong_t p_athr_empty_ahb; /* athread walk stats */
176 176 ulong_t p_athr_full_ahb; /* athread walk stats */
177 177 pgcnt_t p_maxapurge_npages; /* max pages to purge at a time */
178 178 int p_shrink_shft; /* reap shift factor */
179 179 #ifdef _LP64
180 180 ulong_t pad[3];
181 181 #endif /* _LP64 */
182 182 } pctrl3;
183 183
184 184 #define seg_pdisabled pctrl1.p_disabled
185 185 #define seg_pmaxwindow pctrl1.p_maxwin
186 186 #define seg_phashsize_win pctrl1.p_hashwin_sz
187 187 #define seg_phashtab_win pctrl1.p_htabwin
188 188 #define seg_phashsize_wired pctrl1.p_hashwired_sz
189 189 #define seg_phashtab_wired pctrl1.p_htabwired
190 190 #define seg_pkmcache pctrl1.p_kmcache
191 191 #define seg_pmem_mtx pctrl2.p_mem_mtx
192 192 #define seg_plocked_window pctrl2.p_locked_win
193 193 #define seg_plocked pctrl2.p_locked
194 194 #define seg_pahcur pctrl2.p_ahcur
195 195 #define seg_pathr_on pctrl2.p_athr_on
196 196 #define seg_pahhead pctrl2.p_ahhead
197 197 #define seg_pmax_pcpage pctrl3.p_pcp_maxage
198 198 #define seg_pathr_empty_ahb pctrl3.p_athr_empty_ahb
199 199 #define seg_pathr_full_ahb pctrl3.p_athr_full_ahb
200 200 #define seg_pshrink_shift pctrl3.p_shrink_shft
201 201 #define seg_pmaxapurge_npages pctrl3.p_maxapurge_npages
202 202
203 203 #define P_HASHWIN_MASK (seg_phashsize_win - 1)
204 204 #define P_HASHWIRED_MASK (seg_phashsize_wired - 1)
205 205 #define P_BASESHIFT (6)
206 206
207 207 kthread_t *seg_pasync_thr;
208 208
209 209 extern struct seg_ops segvn_ops;
210 210 extern struct seg_ops segspt_shmops;
211 211
212 212 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
213 213 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
214 214
215 215 #define LBOLT_DELTA(t) ((ulong_t)(ddi_get_lbolt() - (t)))
216 216
217 217 #define PCP_AGE(pcp) LBOLT_DELTA((pcp)->p_lbolt)
218 218
219 219 /*
220 220 * htag0 argument can be a seg or amp pointer.
221 221 */
222 222 #define P_HASHBP(seg, htag0, addr, flags) \
223 223 (IS_PFLAGS_WIRED((flags)) ? \
224 224 ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
225 225 ((uintptr_t)(htag0) >> P_BASESHIFT)]) : \
226 226 (&seg_phashtab_win[P_HASHWIN_MASK & \
227 227 (((uintptr_t)(htag0) >> 3) ^ \
228 228 ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ? \
229 229 (flags >> 16) : page_get_shift((seg)->s_szc))))]))
230 230
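/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): for wired
 * requests P_HASHBP() indexes the wired table from the htag0 pointer
 * alone; for non wired requests it also folds in the address, scaled
 * either by the segment's page size or by an explicit shift carried in
 * the upper 16 bits of flags when SEGP_PSHIFT is set.  A hypothetical
 * caller asking for 4M (shift 22) hash granularity would do:
 *
 *        uint_t flags = SEGP_PSHIFT | (22 << 16);
 *        struct seg_phash *hp = P_HASHBP(seg, (void *)seg, addr, flags);
 */
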
231 231 /*
232 232 * htag0 argument can be a seg or amp pointer.
233 233 */
234 234 #define P_MATCH(pcp, htag0, addr, len) \
235 235 ((pcp)->p_htag0 == (htag0) && \
236 236 (pcp)->p_addr == (addr) && \
237 237 (pcp)->p_len >= (len))
238 238
239 239 #define P_MATCH_PP(pcp, htag0, addr, len, pp) \
240 240 ((pcp)->p_pp == (pp) && \
241 241 (pcp)->p_htag0 == (htag0) && \
242 242 (pcp)->p_addr == (addr) && \
243 243 (pcp)->p_len >= (len))
244 244
245 245 #define plink2pcache(pl) ((struct seg_pcache *)((uintptr_t)(pl) - \
246 246 offsetof(struct seg_pcache, p_plink)))
247 247
248 248 #define hlink2phash(hl, l) ((struct seg_phash *)((uintptr_t)(hl) - \
249 249 offsetof(struct seg_phash, p_halink[l])))
250 250
251 251 /*
252 252 * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
253 253 * active hash bucket lists. We maintain active bucket lists to reduce the
254 254 * overhead of finding active buckets during asynchronous purging since there
255 255 * can be 10s of millions of buckets on a large system but only a small subset
256 256 * of them in actual use.
257 257 *
258 258 * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
259 259 * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
260 260 * buckets. The other list is used by asynchronous purge thread. This allows
261 261 * the purge thread to walk its active list without holding seg_pmem_mtx for a
262 262 * long time. When asynchronous thread is done with its list it switches to
263 263 * current active list and makes the list it just finished processing as
264 264 * current active list.
265 265 *
266 266 * seg_padd_abuck() only adds the bucket to current list if the bucket is not
267 267 * yet on any list. seg_premove_abuck() may remove the bucket from either
268 268 * list. If the bucket is on current list it will be always removed. Otherwise
269 269 * the bucket is only removed if asynchronous purge thread is not currently
270 270 * running or seg_premove_abuck() is called by asynchronous purge thread
271 271 * itself. A given bucket can only be on one of active lists at a time. These
272 272 * routines should be called with per bucket lock held. The routines use
273 273 * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
274 274 * the first entry is added to the bucket chain and seg_premove_abuck() must
275 275 * be called after the last pcp entry is deleted from its chain. Per bucket
276 276 * lock should be held by the callers. This avoids a potential race condition
277 277 * when seg_premove_abuck() removes a bucket after pcp entries are added to
278 278 * its list after the caller checked that the bucket has no entries. (this
279 279 * race would cause a loss of an active bucket from the active lists).
280 280 *
281 281 * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
282 282 * New entries are added to the end of the list since LRU is used as the
283 283 * purging policy.
284 284 */
285 285 static void
286 286 seg_padd_abuck(struct seg_phash *hp)
287 287 {
288 288 int lix;
289 289
290 290 ASSERT(MUTEX_HELD(&hp->p_hmutex));
291 291 ASSERT((struct seg_phash *)hp->p_hnext != hp);
292 292 ASSERT((struct seg_phash *)hp->p_hprev != hp);
293 293 ASSERT(hp->p_hnext == hp->p_hprev);
294 294 ASSERT(!IS_PCP_WIRED(hp->p_hnext));
295 295 ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
296 296 ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
297 297 ASSERT(hp >= seg_phashtab_win &&
298 298 hp < &seg_phashtab_win[seg_phashsize_win]);
299 299
300 300 /*
301 301 * This bucket can already be on one of active lists
302 302 * since seg_premove_abuck() may have failed to remove it
303 303 * before.
304 304 */
305 305 mutex_enter(&seg_pmem_mtx);
306 306 lix = seg_pahcur;
307 307 ASSERT(lix >= 0 && lix <= 1);
308 308 if (hp->p_halink[lix].p_lnext != NULL) {
309 309 ASSERT(hp->p_halink[lix].p_lprev != NULL);
310 310 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
311 311 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
312 312 mutex_exit(&seg_pmem_mtx);
313 313 return;
314 314 }
315 315 ASSERT(hp->p_halink[lix].p_lprev == NULL);
316 316
317 317 /*
318 318 * If this bucket is still on list !lix async thread can't yet remove
319 319 * it since we hold here per bucket lock. In this case just return
320 320 * since async thread will eventually find and process this bucket.
321 321 */
322 322 if (hp->p_halink[!lix].p_lnext != NULL) {
323 323 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
324 324 mutex_exit(&seg_pmem_mtx);
325 325 return;
326 326 }
327 327 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
328 328 /*
329 329 * This bucket is not on any active bucket list yet.
330 330 * Add the bucket to the tail of current active list.
331 331 */
332 332 hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
333 333 hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
334 334 seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
335 335 seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
336 336 mutex_exit(&seg_pmem_mtx);
337 337 }
338 338
339 339 static void
340 340 seg_premove_abuck(struct seg_phash *hp, int athr)
341 341 {
342 342 int lix;
343 343
344 344 ASSERT(MUTEX_HELD(&hp->p_hmutex));
345 345 ASSERT((struct seg_phash *)hp->p_hnext == hp);
346 346 ASSERT((struct seg_phash *)hp->p_hprev == hp);
347 347 ASSERT(hp >= seg_phashtab_win &&
348 348 hp < &seg_phashtab_win[seg_phashsize_win]);
349 349
350 350 if (athr) {
351 351 ASSERT(seg_pathr_on);
352 352 ASSERT(seg_pahcur <= 1);
353 353 /*
354 354 * We are called by asynchronous thread that found this bucket
355 355 * on not currently active (i.e. !seg_pahcur) list. Remove it
356 356 * from there. Per bucket lock we are holding makes sure
357 357 * seg_pinsert() can't sneak in and add pcp entries to this
358 358 * bucket right before we remove the bucket from its list.
359 359 */
360 360 lix = !seg_pahcur;
361 361 ASSERT(hp->p_halink[lix].p_lnext != NULL);
362 362 ASSERT(hp->p_halink[lix].p_lprev != NULL);
363 363 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
364 364 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
365 365 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
366 366 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
367 367 hp->p_halink[lix].p_lnext = NULL;
368 368 hp->p_halink[lix].p_lprev = NULL;
369 369 return;
370 370 }
371 371
372 372 mutex_enter(&seg_pmem_mtx);
373 373 lix = seg_pahcur;
374 374 ASSERT(lix >= 0 && lix <= 1);
375 375
376 376 /*
377 377 * If the bucket is on currently active list just remove it from
378 378 * there.
379 379 */
380 380 if (hp->p_halink[lix].p_lnext != NULL) {
381 381 ASSERT(hp->p_halink[lix].p_lprev != NULL);
382 382 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
383 383 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
384 384 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
385 385 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
386 386 hp->p_halink[lix].p_lnext = NULL;
387 387 hp->p_halink[lix].p_lprev = NULL;
388 388 mutex_exit(&seg_pmem_mtx);
389 389 return;
390 390 }
391 391 ASSERT(hp->p_halink[lix].p_lprev == NULL);
392 392
393 393 /*
394 394 * If asynchronous thread is not running we can remove the bucket from
395 395 * not currently active list. The bucket must be on this list since we
396 396 * already checked that it's not on the other list and the bucket from
397 397 * which we just deleted the last pcp entry must be still on one of the
398 398 * active bucket lists.
399 399 */
400 400 lix = !lix;
401 401 ASSERT(hp->p_halink[lix].p_lnext != NULL);
402 402 ASSERT(hp->p_halink[lix].p_lprev != NULL);
403 403
404 404 if (!seg_pathr_on) {
405 405 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
406 406 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
407 407 hp->p_halink[lix].p_lnext = NULL;
408 408 hp->p_halink[lix].p_lprev = NULL;
409 409 }
410 410 mutex_exit(&seg_pmem_mtx);
411 411 }
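
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): the
 * insert and delete paths further down follow the protocol described
 * above: a bucket is linked onto an active list only when its chain
 * goes from empty to non-empty (hp->p_hprev == pcp right after the
 * insert means pcp is the only entry) and unlinked only once the chain
 * is empty again, always under the per bucket lock.
 *
 *        On insert (see seg_pinsert()):
 *                mutex_enter(&hp->p_hmutex);
 *                ... link pcp onto hp's chain ...
 *                if (hp->p_hprev == pcp)
 *                        seg_padd_abuck(hp);
 *                mutex_exit(&hp->p_hmutex);
 *
 *        On delete (see seg_pinactive()), after unlinking pcp:
 *                if (hp->p_hnext == (struct seg_pcache *)hp)
 *                        seg_premove_abuck(hp, 0);
 *                mutex_exit(&hp->p_hmutex);
 */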
412 412
413 413 /*
414 414 * Check if bucket pointed by hp already has a pcp entry that matches request
415 415 * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
416 416 * Also delete matching entries that cover smaller address range but start
417 417 * at the same address as addr argument. Return the list of deleted entries if
418 418 * any. This is an internal helper function called from seg_pinsert() only
419 419 * for non wired shadow lists. The caller already holds a per seg/amp list
420 420 * lock.
421 421 */
422 422 static struct seg_pcache *
423 423 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
424 424 caddr_t addr, size_t len, int *found)
425 425 {
426 426 struct seg_pcache *pcp;
427 427 struct seg_pcache *delcallb_list = NULL;
428 428
429 429 ASSERT(MUTEX_HELD(&hp->p_hmutex));
430 430
431 431 *found = 0;
432 432 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
433 433 pcp = pcp->p_hnext) {
434 434 ASSERT(pcp->p_hashp == hp);
435 435 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
436 436 ASSERT(!IS_PCP_WIRED(pcp));
437 437 if (pcp->p_len < len) {
438 438 pcache_link_t *plinkp;
439 439 if (pcp->p_active) {
440 440 continue;
441 441 }
442 442 plinkp = &pcp->p_plink;
443 443 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
444 444 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
445 445 pcp->p_hprev->p_hnext = pcp->p_hnext;
446 446 pcp->p_hnext->p_hprev = pcp->p_hprev;
447 447 pcp->p_hprev = delcallb_list;
448 448 delcallb_list = pcp;
449 449 } else {
450 450 *found = 1;
451 451 break;
452 452 }
453 453 }
454 454 }
455 455 return (delcallb_list);
456 456 }
457 457
458 458 /*
459 459 * lookup an address range in pagelock cache. Return shadow list and bump up
460 460 * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
461 461 * as a lookup tag.
462 462 */
463 463 struct page **
464 464 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
465 465 enum seg_rw rw, uint_t flags)
466 466 {
467 467 struct seg_pcache *pcp;
468 468 struct seg_phash *hp;
469 469 void *htag0;
470 470
471 471 ASSERT(seg != NULL);
472 472 ASSERT(rw == S_READ || rw == S_WRITE);
473 473
474 474 /*
475 475 * Skip pagelock cache, while DR is in progress or
476 476 * seg_pcache is off.
477 477 */
478 478 if (seg_pdisabled) {
479 479 return (NULL);
480 480 }
481 481 ASSERT(seg_phashsize_win != 0);
482 482
483 483 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
484 484 hp = P_HASHBP(seg, htag0, addr, flags);
485 485 mutex_enter(&hp->p_hmutex);
486 486 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
487 487 pcp = pcp->p_hnext) {
488 488 ASSERT(pcp->p_hashp == hp);
489 489 if (P_MATCH(pcp, htag0, addr, len)) {
490 490 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
491 491 /*
492 492 * If this request wants to write pages
493 493 * but write permissions starting from
494 494 * addr don't cover the entire length len
495 495 * return lookup failure back to the caller.
496 496 * It will check protections and fail this
497 497 * pagelock operation with EACCES error.
498 498 */
499 499 if (rw == S_WRITE && pcp->p_wlen < len) {
500 500 break;
501 501 }
502 502 if (pcp->p_active == UINT_MAX) {
503 503 break;
504 504 }
505 505 pcp->p_active++;
506 506 if (rw == S_WRITE && !pcp->p_write) {
507 507 pcp->p_write = 1;
508 508 }
509 509 mutex_exit(&hp->p_hmutex);
510 510 return (pcp->p_pp);
511 511 }
512 512 }
513 513 mutex_exit(&hp->p_hmutex);
514 514 return (NULL);
515 515 }
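
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): a segment
 * driver's pagelock path might use the cache roughly like this,
 * assuming a hypothetical reclaim callback my_reclaim():
 *
 *        On L_PAGELOCK, try the cache before building a shadow list:
 *                pplist = seg_plookup(seg, amp, addr, len, rw, 0);
 *                if (pplist != NULL) {
 *                        *ppp = pplist;
 *                        return (0);
 *                }
 *                ... lock the pages, build pplist, call seg_pinsert() ...
 *
 *        On L_PAGEUNLOCK, drop the active hold; pcache or the callback
 *        performs the real page unlocking:
 *                seg_pinactive(seg, amp, addr, len, *ppp, rw, 0,
 *                    my_reclaim);
 */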
516 516
517 517 /*
518 518 * mark address range inactive. If the cache is off or the address range is
519 519 * not in the cache or another shadow list that covers bigger range is found
520 520 * we call the segment driver to reclaim the pages. Otherwise just decrement
521 521 * active count and set ref bit. If amp is not NULL use amp as a lookup tag
522 522 * otherwise use seg as a lookup tag.
523 523 */
524 524 void
525 525 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
526 526 size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
527 527 seg_preclaim_cbfunc_t callback)
528 528 {
529 529 struct seg_pcache *pcp;
530 530 struct seg_phash *hp;
531 531 kmutex_t *pmtx = NULL;
532 532 pcache_link_t *pheadp;
533 533 void *htag0;
534 534 pgcnt_t npages = 0;
535 535 int keep = 0;
536 536
537 537 ASSERT(seg != NULL);
538 538 ASSERT(rw == S_READ || rw == S_WRITE);
539 539
540 540 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
541 541
542 542 /*
543 543 * Skip lookup if pcache is not configured.
544 544 */
545 545 if (seg_phashsize_win == 0) {
546 546 goto out;
547 547 }
548 548
549 549 /*
550 550 * Grab per seg/amp lock before hash lock if we are going to remove
551 551 * inactive entry from pcache.
552 552 */
553 553 if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
554 554 if (amp == NULL) {
555 555 pheadp = &seg->s_phead;
556 556 pmtx = &seg->s_pmtx;
557 557 } else {
558 558 pheadp = &amp->a_phead;
559 559 pmtx = &amp->a_pmtx;
560 560 }
561 561 mutex_enter(pmtx);
562 562 }
563 563
564 564 hp = P_HASHBP(seg, htag0, addr, flags);
565 565 mutex_enter(&hp->p_hmutex);
566 566 again:
567 567 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
568 568 pcp = pcp->p_hnext) {
569 569 ASSERT(pcp->p_hashp == hp);
570 570 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
571 571 ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
572 572 ASSERT(pcp->p_active);
573 573 if (keep) {
574 574 /*
575 575 * Don't remove this pcp entry
576 576 * if we didn't find duplicate
577 577 * shadow lists on second search.
578 578 * Somebody removed those duplicates
579 579 * since we dropped hash lock after first
580 580 * search.
581 581 */
582 582 ASSERT(pmtx != NULL);
583 583 ASSERT(!IS_PFLAGS_WIRED(flags));
584 584 mutex_exit(pmtx);
585 585 pmtx = NULL;
586 586 }
587 587 pcp->p_active--;
588 588 if (pcp->p_active == 0 && (pmtx != NULL ||
589 589 (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
590 590
591 591 /*
592 592 * This entry is no longer active. Remove it
593 593 * now either because pcaching is temporarily
594 594 * disabled or there're other pcp entries that
595 595 * can match this pagelock request (i.e. this
596 596 * entry is a duplicate).
597 597 */
598 598
599 599 ASSERT(callback == pcp->p_callback);
600 600 if (pmtx != NULL) {
601 601 pcache_link_t *plinkp = &pcp->p_plink;
602 602 ASSERT(!IS_PCP_WIRED(pcp));
603 603 ASSERT(pheadp->p_lnext != pheadp);
604 604 ASSERT(pheadp->p_lprev != pheadp);
605 605 plinkp->p_lprev->p_lnext =
606 606 plinkp->p_lnext;
607 607 plinkp->p_lnext->p_lprev =
608 608 plinkp->p_lprev;
609 609 }
610 610 pcp->p_hprev->p_hnext = pcp->p_hnext;
611 611 pcp->p_hnext->p_hprev = pcp->p_hprev;
612 612 if (!IS_PCP_WIRED(pcp) &&
613 613 hp->p_hnext == (struct seg_pcache *)hp) {
614 614 /*
615 615 * We removed the last entry from this
616 616 * bucket. Now remove the bucket from
617 617 * its active list.
618 618 */
619 619 seg_premove_abuck(hp, 0);
620 620 }
621 621 mutex_exit(&hp->p_hmutex);
622 622 if (pmtx != NULL) {
623 623 mutex_exit(pmtx);
624 624 }
625 625 len = pcp->p_len;
626 626 npages = btop(len);
627 627 if (rw != S_WRITE && pcp->p_write) {
628 628 rw = S_WRITE;
629 629 }
630 630 kmem_cache_free(seg_pkmcache, pcp);
631 631 goto out;
632 632 } else {
633 633 /*
634 634 * We found a matching pcp entry but will not
635 635 * free it right away even if it's no longer
636 636 * active.
637 637 */
638 638 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
639 639 /*
640 640 * Set the reference bit and mark the
641 641 * time of last access to this pcp
642 642 * so that asynchronous thread doesn't
643 643 * free it immediately since
644 644 * it may be reactivated very soon.
645 645 */
646 646 pcp->p_lbolt = ddi_get_lbolt();
647 647 pcp->p_ref = 1;
648 648 }
649 649 mutex_exit(&hp->p_hmutex);
650 650 if (pmtx != NULL) {
651 651 mutex_exit(pmtx);
652 652 }
653 653 return;
654 654 }
655 655 } else if (!IS_PFLAGS_WIRED(flags) &&
656 656 P_MATCH(pcp, htag0, addr, len)) {
657 657 /*
658 658 * This is a duplicate pcp entry. This situation may
659 659 * happen if a bigger shadow list that covers our
660 660 * range was added while our entry was still active.
661 661 * Now we can free our pcp entry if it becomes
662 662 * inactive.
663 663 */
664 664 if (!pcp->p_active) {
665 665 /*
666 666 * Mark this entry as referenced just in case
667 667 * we'll free our own pcp entry soon.
668 668 */
669 669 pcp->p_lbolt = ddi_get_lbolt();
670 670 pcp->p_ref = 1;
671 671 }
672 672 if (pmtx != NULL) {
673 673 /*
674 674 * we are already holding pmtx and found a
675 675 * duplicate. Don't keep our own pcp entry.
676 676 */
677 677 keep = 0;
678 678 continue;
679 679 }
680 680 /*
681 681 * We have to use mutex_tryenter to attempt to lock
682 682 * seg/amp list lock since we already hold hash lock
683 683 * and seg/amp list lock is above hash lock in lock
684 684 * order. If mutex_tryenter fails drop hash lock and
685 685 * retake both locks in correct order and re-search
686 686 * this hash chain.
687 687 */
688 688 ASSERT(keep == 0);
689 689 if (amp == NULL) {
690 690 pheadp = &seg->s_phead;
691 691 pmtx = &seg->s_pmtx;
692 692 } else {
693 693 pheadp = &amp->a_phead;
694 694 pmtx = &amp->a_pmtx;
695 695 }
696 696 if (!mutex_tryenter(pmtx)) {
697 697 mutex_exit(&hp->p_hmutex);
698 698 mutex_enter(pmtx);
699 699 mutex_enter(&hp->p_hmutex);
700 700 /*
701 701 * If we don't find bigger shadow list on
702 702 * second search (it may happen since we
703 703 * dropped bucket lock) keep the entry that
704 704 * matches our own shadow list.
705 705 */
706 706 keep = 1;
707 707 goto again;
708 708 }
709 709 }
710 710 }
711 711 mutex_exit(&hp->p_hmutex);
712 712 if (pmtx != NULL) {
713 713 mutex_exit(pmtx);
714 714 }
715 715 out:
716 716 (*callback)(htag0, addr, len, pp, rw, 0);
717 717 if (npages) {
718 718 mutex_enter(&seg_pmem_mtx);
719 719 ASSERT(seg_plocked >= npages);
720 720 seg_plocked -= npages;
721 721 if (!IS_PFLAGS_WIRED(flags)) {
722 722 ASSERT(seg_plocked_window >= npages);
723 723 seg_plocked_window -= npages;
724 724 }
725 725 mutex_exit(&seg_pmem_mtx);
726 726 }
727 727
728 728 }
729 729
730 730 #ifdef DEBUG
731 731 static uint32_t p_insert_chk_mtbf = 0;
732 732 #endif
733 733
734 734 /*
735 735 * The seg_pinsert_check() is used by segment drivers to predict whether
736 736 * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
737 737 */
738 738 /*ARGSUSED*/
739 739 int
740 740 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
741 741 size_t len, uint_t flags)
742 742 {
743 743 ASSERT(seg != NULL);
744 744
745 745 #ifdef DEBUG
746 746 if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
747 747 return (SEGP_FAIL);
748 748 }
749 749 #endif
750 750
751 751 if (seg_pdisabled) {
752 752 return (SEGP_FAIL);
753 753 }
754 754 ASSERT(seg_phashsize_win != 0);
755 755
756 756 if (IS_PFLAGS_WIRED(flags)) {
757 757 return (SEGP_SUCCESS);
758 758 }
759 759
760 760 if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
761 761 return (SEGP_FAIL);
762 762 }
763 763
764 764 if (freemem < desfree) {
765 765 return (SEGP_FAIL);
766 766 }
767 767
768 768 return (SEGP_SUCCESS);
769 769 }
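
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): a driver
 * is expected to call seg_pinsert_check() before the expensive work of
 * locking pages and allocating a shadow list, for example:
 *
 *        if (seg_pinsert_check(seg, amp, addr, len, flags) !=
 *            SEGP_SUCCESS) {
 *                ... fall back to the uncached pagelock path ...
 *        }
 *        ... lock pages, allocate and fill the shadow list ...
 *        (void) seg_pinsert(seg, amp, addr, len, wlen, pplist, rw,
 *            flags, my_reclaim);
 *
 * my_reclaim stands in for the driver's reclaim callback; the check is
 * only a prediction, so seg_pinsert() itself may still return SEGP_FAIL.
 */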
770 770
771 771 #ifdef DEBUG
772 772 static uint32_t p_insert_mtbf = 0;
773 773 #endif
774 774
775 775 /*
776 776 * Insert address range with shadow list into pagelock cache if there's no
777 777 * shadow list already cached for this address range. If the cache is off or
778 778 * caching is temporarily disabled or the allowed 'window' is exceeded return
779 779 * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
780 780 *
781 781 * For non wired shadow lists (segvn case) include address in the hashing
782 782 * function to avoid linking all the entries from the same segment or amp on
783 783 * the same bucket. amp is used instead of seg if amp is not NULL. Non wired
784 784 * pcache entries are also linked on a per segment/amp list so that all
785 785 * entries can be found quickly during seg/amp purge without walking the
786 786 * entire pcache hash table. For wired shadow lists (segspt case) we
787 787 * don't use address hashing and per segment linking because the caller
788 788 * currently inserts only one entry per segment that covers the entire
789 789 * segment. If we used per segment linking even for segspt it would complicate
790 790 * seg_ppurge_wiredpp() locking.
791 791 *
792 792 * Both hash bucket and per seg/amp locks need to be held before adding a non
793 793 * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
794 794 * first.
795 795 *
796 796 * This function will also remove from pcache old inactive shadow lists that
797 797 * overlap with this request but cover smaller range for the same start
798 798 * address.
799 799 */
800 800 int
801 801 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
802 802 size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
803 803 seg_preclaim_cbfunc_t callback)
804 804 {
805 805 struct seg_pcache *pcp;
806 806 struct seg_phash *hp;
807 807 pgcnt_t npages;
808 808 pcache_link_t *pheadp;
809 809 kmutex_t *pmtx;
810 810 struct seg_pcache *delcallb_list = NULL;
811 811
812 812 ASSERT(seg != NULL);
813 813 ASSERT(rw == S_READ || rw == S_WRITE);
814 814 ASSERT(rw == S_READ || wlen == len);
815 815 ASSERT(rw == S_WRITE || wlen <= len);
816 816 ASSERT(amp == NULL || wlen == len);
817 817
818 818 #ifdef DEBUG
819 819 if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
820 820 return (SEGP_FAIL);
821 821 }
822 822 #endif
823 823
824 824 if (seg_pdisabled) {
825 825 return (SEGP_FAIL);
826 826 }
827 827 ASSERT(seg_phashsize_win != 0);
828 828
829 829 ASSERT((len & PAGEOFFSET) == 0);
830 830 npages = btop(len);
831 831 mutex_enter(&seg_pmem_mtx);
832 832 if (!IS_PFLAGS_WIRED(flags)) {
833 833 if (seg_plocked_window + npages > seg_pmaxwindow) {
834 834 mutex_exit(&seg_pmem_mtx);
835 835 return (SEGP_FAIL);
836 836 }
837 837 seg_plocked_window += npages;
838 838 }
839 839 seg_plocked += npages;
840 840 mutex_exit(&seg_pmem_mtx);
841 841
842 842 pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
843 843 /*
844 844 * If amp is not NULL set htag0 to amp otherwise set it to seg.
845 845 */
846 846 if (amp == NULL) {
847 847 pcp->p_htag0 = (void *)seg;
848 848 pcp->p_flags = flags & 0xffff;
849 849 } else {
850 850 pcp->p_htag0 = (void *)amp;
851 851 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
852 852 }
853 853 pcp->p_addr = addr;
854 854 pcp->p_len = len;
855 855 pcp->p_wlen = wlen;
856 856 pcp->p_pp = pp;
857 857 pcp->p_write = (rw == S_WRITE);
858 858 pcp->p_callback = callback;
859 859 pcp->p_active = 1;
860 860
861 861 hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
862 862 if (!IS_PFLAGS_WIRED(flags)) {
863 863 int found;
864 864 void *htag0;
865 865 if (amp == NULL) {
866 866 pheadp = &seg->s_phead;
867 867 pmtx = &seg->s_pmtx;
868 868 htag0 = (void *)seg;
869 869 } else {
870 870 pheadp = &amp->a_phead;
871 871 pmtx = &amp->a_pmtx;
872 872 htag0 = (void *)amp;
873 873 }
874 874 mutex_enter(pmtx);
875 875 mutex_enter(&hp->p_hmutex);
876 876 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
877 877 len, &found);
878 878 if (found) {
879 879 mutex_exit(&hp->p_hmutex);
880 880 mutex_exit(pmtx);
881 881 mutex_enter(&seg_pmem_mtx);
882 882 seg_plocked -= npages;
883 883 seg_plocked_window -= npages;
884 884 mutex_exit(&seg_pmem_mtx);
885 885 kmem_cache_free(seg_pkmcache, pcp);
886 886 goto out;
887 887 }
888 888 pcp->p_plink.p_lnext = pheadp->p_lnext;
889 889 pcp->p_plink.p_lprev = pheadp;
890 890 pheadp->p_lnext->p_lprev = &pcp->p_plink;
891 891 pheadp->p_lnext = &pcp->p_plink;
892 892 } else {
893 893 mutex_enter(&hp->p_hmutex);
894 894 }
895 895 pcp->p_hashp = hp;
896 896 pcp->p_hnext = hp->p_hnext;
897 897 pcp->p_hprev = (struct seg_pcache *)hp;
898 898 hp->p_hnext->p_hprev = pcp;
899 899 hp->p_hnext = pcp;
900 900 if (!IS_PFLAGS_WIRED(flags) &&
901 901 hp->p_hprev == pcp) {
902 902 seg_padd_abuck(hp);
903 903 }
904 904 mutex_exit(&hp->p_hmutex);
905 905 if (!IS_PFLAGS_WIRED(flags)) {
906 906 mutex_exit(pmtx);
907 907 }
908 908
909 909 out:
910 910 npages = 0;
911 911 while (delcallb_list != NULL) {
912 912 pcp = delcallb_list;
913 913 delcallb_list = pcp->p_hprev;
914 914 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
915 915 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
916 916 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
917 917 npages += btop(pcp->p_len);
918 918 kmem_cache_free(seg_pkmcache, pcp);
919 919 }
920 920 if (npages) {
921 921 ASSERT(!IS_PFLAGS_WIRED(flags));
922 922 mutex_enter(&seg_pmem_mtx);
923 923 ASSERT(seg_plocked >= npages);
924 924 ASSERT(seg_plocked_window >= npages);
925 925 seg_plocked -= npages;
926 926 seg_plocked_window -= npages;
927 927 mutex_exit(&seg_pmem_mtx);
928 928 }
929 929
930 930 return (SEGP_SUCCESS);
931 931 }
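
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): the wlen
 * argument lets an S_READ request cache a shadow list whose tail is not
 * writable.  Caching 8 pages of which only the first 5 are writable
 * (amp must be NULL in that case, per the asserts above) would look
 * like:
 *
 *        (void) seg_pinsert(seg, NULL, addr, 8 * PAGESIZE,
 *            5 * PAGESIZE, pplist, S_READ, 0, my_reclaim);
 *
 * A later S_READ lookup of up to 8 pages at addr hits this entry, while
 * an S_WRITE lookup longer than 5 pages fails the p_wlen check in
 * seg_plookup() and falls back to the driver.  my_reclaim is a
 * hypothetical callback.
 */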
932 932
933 933 /*
934 934 * purge entries from the pagelock cache if not active
935 935 * and not recently used.
936 936 */
937 937 static void
938 938 seg_ppurge_async(int force)
939 939 {
940 940 struct seg_pcache *delcallb_list = NULL;
941 941 struct seg_pcache *pcp;
942 942 struct seg_phash *hp;
943 943 pgcnt_t npages = 0;
944 944 pgcnt_t npages_window = 0;
945 945 pgcnt_t npgs_to_purge;
946 946 pgcnt_t npgs_purged = 0;
947 947 int hlinks = 0;
948 948 int hlix;
949 949 pcache_link_t *hlinkp;
950 950 pcache_link_t *hlnextp = NULL;
951 951 int lowmem;
952 952 int trim;
953 953
954 954 ASSERT(seg_phashsize_win != 0);
955 955
956 956 /*
957 957 * if the cache is off or empty, return
958 958 */
959 959 if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
960 960 return;
961 961 }
962 962
963 963 if (!force) {
964 964 lowmem = 0;
965 965 trim = 0;
966 966 if (freemem < lotsfree + needfree) {
967 967 spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
968 968 if (fmem <= 5 * (desfree >> 2)) {
969 969 lowmem = 1;
970 970 } else if (fmem <= 7 * (lotsfree >> 3)) {
971 971 if (seg_plocked_window >=
972 972 (availrmem_initial >> 1)) {
973 973 lowmem = 1;
974 974 }
975 975 } else if (fmem < lotsfree) {
976 976 if (seg_plocked_window >=
977 977 3 * (availrmem_initial >> 2)) {
978 978 lowmem = 1;
979 979 }
980 980 }
981 981 }
982 982 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
983 983 trim = 1;
984 984 }
985 985 if (!lowmem && !trim) {
986 986 return;
987 987 }
988 988 npgs_to_purge = seg_plocked_window >>
989 989 seg_pshrink_shift;
990 990 if (lowmem) {
991 991 npgs_to_purge = MIN(npgs_to_purge,
992 992 MAX(seg_pmaxapurge_npages, desfree));
993 993 } else {
994 994 npgs_to_purge = MIN(npgs_to_purge,
995 995 seg_pmaxapurge_npages);
996 996 }
997 997 if (npgs_to_purge == 0) {
998 998 return;
999 999 }
1000 1000 } else {
1001 1001 struct seg_phash_wired *hpw;
1002 1002
1003 1003 ASSERT(seg_phashsize_wired != 0);
1004 1004
1005 1005 for (hpw = seg_phashtab_wired;
1006 1006 hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1007 1007
1008 1008 if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1009 1009 continue;
1010 1010 }
1011 1011
1012 1012 mutex_enter(&hpw->p_hmutex);
1013 1013
1014 1014 for (pcp = hpw->p_hnext;
1015 1015 pcp != (struct seg_pcache *)hpw;
1016 1016 pcp = pcp->p_hnext) {
1017 1017
1018 1018 ASSERT(IS_PCP_WIRED(pcp));
1019 1019 ASSERT(pcp->p_hashp ==
1020 1020 (struct seg_phash *)hpw);
1021 1021
1022 1022 if (pcp->p_active) {
1023 1023 continue;
1024 1024 }
1025 1025 pcp->p_hprev->p_hnext = pcp->p_hnext;
1026 1026 pcp->p_hnext->p_hprev = pcp->p_hprev;
1027 1027 pcp->p_hprev = delcallb_list;
1028 1028 delcallb_list = pcp;
1029 1029 }
1030 1030 mutex_exit(&hpw->p_hmutex);
1031 1031 }
1032 1032 }
1033 1033
1034 1034 mutex_enter(&seg_pmem_mtx);
1035 1035 if (seg_pathr_on) {
1036 1036 mutex_exit(&seg_pmem_mtx);
1037 1037 goto runcb;
1038 1038 }
1039 1039 seg_pathr_on = 1;
1040 1040 mutex_exit(&seg_pmem_mtx);
1041 1041 ASSERT(seg_pahcur <= 1);
1042 1042 hlix = !seg_pahcur;
1043 1043
1044 1044 again:
1045 1045 for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1046 1046 hlinkp = hlnextp) {
1047 1047
1048 1048 hlnextp = hlinkp->p_lnext;
1049 1049 ASSERT(hlnextp != NULL);
1050 1050
1051 1051 hp = hlink2phash(hlinkp, hlix);
1052 1052 if (hp->p_hnext == (struct seg_pcache *)hp) {
1053 1053 seg_pathr_empty_ahb++;
1054 1054 continue;
1055 1055 }
1056 1056 seg_pathr_full_ahb++;
1057 1057 mutex_enter(&hp->p_hmutex);
1058 1058
1059 1059 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1060 1060 pcp = pcp->p_hnext) {
1061 1061 pcache_link_t *pheadp;
1062 1062 pcache_link_t *plinkp;
1063 1063 void *htag0;
1064 1064 kmutex_t *pmtx;
1065 1065
1066 1066 ASSERT(!IS_PCP_WIRED(pcp));
1067 1067 ASSERT(pcp->p_hashp == hp);
1068 1068
1069 1069 if (pcp->p_active) {
1070 1070 continue;
1071 1071 }
1072 1072 if (!force && pcp->p_ref &&
1073 1073 PCP_AGE(pcp) < seg_pmax_pcpage) {
1074 1074 pcp->p_ref = 0;
1075 1075 continue;
1076 1076 }
1077 1077 plinkp = &pcp->p_plink;
1078 1078 htag0 = pcp->p_htag0;
1079 1079 if (pcp->p_flags & SEGP_AMP) {
1080 1080 pheadp = &((amp_t *)htag0)->a_phead;
1081 1081 pmtx = &((amp_t *)htag0)->a_pmtx;
1082 1082 } else {
1083 1083 pheadp = &((seg_t *)htag0)->s_phead;
1084 1084 pmtx = &((seg_t *)htag0)->s_pmtx;
1085 1085 }
1086 1086 if (!mutex_tryenter(pmtx)) {
1087 1087 continue;
1088 1088 }
1089 1089 ASSERT(pheadp->p_lnext != pheadp);
1090 1090 ASSERT(pheadp->p_lprev != pheadp);
1091 1091 plinkp->p_lprev->p_lnext =
1092 1092 plinkp->p_lnext;
1093 1093 plinkp->p_lnext->p_lprev =
1094 1094 plinkp->p_lprev;
1095 1095 pcp->p_hprev->p_hnext = pcp->p_hnext;
1096 1096 pcp->p_hnext->p_hprev = pcp->p_hprev;
1097 1097 mutex_exit(pmtx);
1098 1098 pcp->p_hprev = delcallb_list;
1099 1099 delcallb_list = pcp;
1100 1100 npgs_purged += btop(pcp->p_len);
1101 1101 }
1102 1102 if (hp->p_hnext == (struct seg_pcache *)hp) {
1103 1103 seg_premove_abuck(hp, 1);
1104 1104 }
1105 1105 mutex_exit(&hp->p_hmutex);
1106 1106 if (npgs_purged >= seg_plocked_window) {
1107 1107 break;
1108 1108 }
1109 1109 if (!force) {
1110 1110 if (npgs_purged >= npgs_to_purge) {
1111 1111 break;
1112 1112 }
1113 1113 if (!trim && !(seg_pathr_full_ahb & 15)) {
1114 1114 ASSERT(lowmem);
1115 1115 if (freemem >= lotsfree + needfree) {
1116 1116 break;
1117 1117 }
1118 1118 }
1119 1119 }
1120 1120 }
1121 1121
1122 1122 if (hlinkp == &seg_pahhead[hlix]) {
1123 1123 /*
1124 1124 * We processed the entire hlix active bucket list
1125 1125 * but didn't find enough pages to reclaim.
1126 1126 * Switch the lists and walk the other list
1127 1127 * if we haven't done it yet.
1128 1128 */
1129 1129 mutex_enter(&seg_pmem_mtx);
1130 1130 ASSERT(seg_pathr_on);
1131 1131 ASSERT(seg_pahcur == !hlix);
1132 1132 seg_pahcur = hlix;
1133 1133 mutex_exit(&seg_pmem_mtx);
1134 1134 if (++hlinks < 2) {
1135 1135 hlix = !hlix;
1136 1136 goto again;
1137 1137 }
1138 1138 } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1139 1139 seg_pahhead[hlix].p_lnext != hlinkp) {
1140 1140 ASSERT(hlinkp != NULL);
1141 1141 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1142 1142 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1143 1143 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1144 1144
1145 1145 /*
1146 1146 * Reinsert the header to point to hlinkp
1147 1147 * so that we start from hlinkp bucket next time around.
1148 1148 */
1149 1149 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1150 1150 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1151 1151 seg_pahhead[hlix].p_lnext = hlinkp;
1152 1152 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1153 1153 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1154 1154 hlinkp->p_lprev = &seg_pahhead[hlix];
1155 1155 }
1156 1156
1157 1157 mutex_enter(&seg_pmem_mtx);
1158 1158 ASSERT(seg_pathr_on);
1159 1159 seg_pathr_on = 0;
1160 1160 mutex_exit(&seg_pmem_mtx);
1161 1161
1162 1162 runcb:
1163 1163 /*
1164 1164 * Run the delayed callback list. segments/amps can't go away until
1165 1165 * callback is executed since they must have non 0 softlockcnt. That's
1166 1166 * why we don't need to hold as/seg/amp locks to execute the callback.
1167 1167 */
1168 1168 while (delcallb_list != NULL) {
1169 1169 pcp = delcallb_list;
1170 1170 delcallb_list = pcp->p_hprev;
1171 1171 ASSERT(!pcp->p_active);
1172 1172 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1173 1173 pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1174 1174 npages += btop(pcp->p_len);
1175 1175 if (!IS_PCP_WIRED(pcp)) {
1176 1176 npages_window += btop(pcp->p_len);
1177 1177 }
1178 1178 kmem_cache_free(seg_pkmcache, pcp);
1179 1179 }
1180 1180 if (npages) {
1181 1181 mutex_enter(&seg_pmem_mtx);
1182 1182 ASSERT(seg_plocked >= npages);
1183 1183 ASSERT(seg_plocked_window >= npages_window);
1184 1184 seg_plocked -= npages;
1185 1185 seg_plocked_window -= npages_window;
1186 1186 mutex_exit(&seg_pmem_mtx);
1187 1187 }
1188 1188 }
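
/*
 * Editorial note (not part of vm_seg.c) on the non-forced thresholds
 * above, writing fmem = MAX(freemem - needfree, 0): when
 * freemem < lotsfree + needfree, memory is considered low if
 *
 *        fmem <= 5/4 * desfree, or
 *        fmem <= 7/8 * lotsfree and the window holds at least 1/2 of
 *            availrmem_initial, or
 *        fmem <  lotsfree and the window holds at least 3/4 of
 *            availrmem_initial.
 *
 * Independently of memory pressure, the window is trimmed once it
 * exceeds 7/8 of seg_pmaxwindow.  Each pass tries to purge
 * 1/2^seg_pshrink_shift (by default 1/32) of the window, capped at
 * seg_pmaxapurge_npages, or at MAX(seg_pmaxapurge_npages, desfree) in
 * the low memory case.
 */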
1189 1189
1190 1190 /*
1191 1191 * Remove cached pages for segment(s) entries from hashtable. The segments
1192 1192 * are identified by pp array. This is useful for multiple seg's cached on
1193 1193 * behalf of dummy segment (ISM/DISM) with common pp array.
1194 1194 */
1195 1195 void
1196 1196 seg_ppurge_wiredpp(struct page **pp)
1197 1197 {
1198 1198 struct seg_pcache *pcp;
1199 1199 struct seg_phash_wired *hp;
1200 1200 pgcnt_t npages = 0;
1201 1201 struct seg_pcache *delcallb_list = NULL;
1202 1202
1203 1203 /*
1204 1204 * if the cache is empty, return
1205 1205 */
1206 1206 if (seg_plocked == 0) {
1207 1207 return;
1208 1208 }
1209 1209 ASSERT(seg_phashsize_wired != 0);
1210 1210
1211 1211 for (hp = seg_phashtab_wired;
1212 1212 hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1213 1213 if (hp->p_hnext == (struct seg_pcache *)hp) {
1214 1214 continue;
1215 1215 }
1216 1216 mutex_enter(&hp->p_hmutex);
1217 1217 pcp = hp->p_hnext;
1218 1218 while (pcp != (struct seg_pcache *)hp) {
1219 1219 ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1220 1220 ASSERT(IS_PCP_WIRED(pcp));
1221 1221 /*
1222 1222 * purge entries which are not active
1223 1223 */
1224 1224 if (!pcp->p_active && pcp->p_pp == pp) {
1225 1225 ASSERT(pcp->p_htag0 != NULL);
1226 1226 pcp->p_hprev->p_hnext = pcp->p_hnext;
1227 1227 pcp->p_hnext->p_hprev = pcp->p_hprev;
1228 1228 pcp->p_hprev = delcallb_list;
1229 1229 delcallb_list = pcp;
1230 1230 }
1231 1231 pcp = pcp->p_hnext;
1232 1232 }
1233 1233 mutex_exit(&hp->p_hmutex);
1234 1234 /*
1235 1235 * segments can't go away until callback is executed since
1236 1236 * they must have non 0 softlockcnt. That's why we don't
1237 1237 * need to hold as/seg locks to execute the callback.
1238 1238 */
1239 1239 while (delcallb_list != NULL) {
1240 1240 int done;
1241 1241 pcp = delcallb_list;
1242 1242 delcallb_list = pcp->p_hprev;
1243 1243 ASSERT(!pcp->p_active);
1244 1244 done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1245 1245 pcp->p_len, pcp->p_pp,
1246 1246 pcp->p_write ? S_WRITE : S_READ, 1);
1247 1247 npages += btop(pcp->p_len);
1248 1248 ASSERT(IS_PCP_WIRED(pcp));
1249 1249 kmem_cache_free(seg_pkmcache, pcp);
1250 1250 if (done) {
1251 1251 ASSERT(delcallb_list == NULL);
1252 1252 goto out;
1253 1253 }
1254 1254 }
1255 1255 }
1256 1256
1257 1257 out:
1258 1258 mutex_enter(&seg_pmem_mtx);
1259 1259 ASSERT(seg_plocked >= npages);
1260 1260 seg_plocked -= npages;
1261 1261 mutex_exit(&seg_pmem_mtx);
1262 1262 }
1263 1263
1264 1264 /*
1265 1265 * purge all entries for a given segment. Since we
1266 1266 * callback into the segment driver directly for page
1267 1267 * reclaim the caller needs to hold the right locks.
1268 1268 */
1269 1269 void
1270 1270 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1271 1271 {
1272 1272 struct seg_pcache *delcallb_list = NULL;
1273 1273 struct seg_pcache *pcp;
1274 1274 struct seg_phash *hp;
1275 1275 pgcnt_t npages = 0;
1276 1276 void *htag0;
1277 1277
1278 1278 if (seg_plocked == 0) {
1279 1279 return;
1280 1280 }
1281 1281 ASSERT(seg_phashsize_win != 0);
1282 1282
1283 1283 /*
1284 1284 * If amp is not NULL use amp as a lookup tag otherwise use seg
1285 1285 * as a lookup tag.
1286 1286 */
1287 1287 htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1288 1288 ASSERT(htag0 != NULL);
1289 1289 if (IS_PFLAGS_WIRED(flags)) {
1290 1290 hp = P_HASHBP(seg, htag0, 0, flags);
1291 1291 mutex_enter(&hp->p_hmutex);
1292 1292 pcp = hp->p_hnext;
1293 1293 while (pcp != (struct seg_pcache *)hp) {
1294 1294 ASSERT(pcp->p_hashp == hp);
1295 1295 ASSERT(IS_PCP_WIRED(pcp));
1296 1296 if (pcp->p_htag0 == htag0) {
1297 1297 if (pcp->p_active) {
1298 1298 break;
1299 1299 }
1300 1300 pcp->p_hprev->p_hnext = pcp->p_hnext;
1301 1301 pcp->p_hnext->p_hprev = pcp->p_hprev;
1302 1302 pcp->p_hprev = delcallb_list;
1303 1303 delcallb_list = pcp;
1304 1304 }
1305 1305 pcp = pcp->p_hnext;
1306 1306 }
1307 1307 mutex_exit(&hp->p_hmutex);
1308 1308 } else {
1309 1309 pcache_link_t *plinkp;
1310 1310 pcache_link_t *pheadp;
1311 1311 kmutex_t *pmtx;
1312 1312
1313 1313 if (amp == NULL) {
1314 1314 ASSERT(seg != NULL);
1315 1315 pheadp = &seg->s_phead;
1316 1316 pmtx = &seg->s_pmtx;
1317 1317 } else {
1318 1318 pheadp = &amp->a_phead;
1319 1319 pmtx = &amp->a_pmtx;
1320 1320 }
1321 1321 mutex_enter(pmtx);
1322 1322 while ((plinkp = pheadp->p_lnext) != pheadp) {
1323 1323 pcp = plink2pcache(plinkp);
1324 1324 ASSERT(!IS_PCP_WIRED(pcp));
1325 1325 ASSERT(pcp->p_htag0 == htag0);
1326 1326 hp = pcp->p_hashp;
1327 1327 mutex_enter(&hp->p_hmutex);
1328 1328 if (pcp->p_active) {
1329 1329 mutex_exit(&hp->p_hmutex);
1330 1330 break;
1331 1331 }
1332 1332 ASSERT(plinkp->p_lprev == pheadp);
1333 1333 pheadp->p_lnext = plinkp->p_lnext;
1334 1334 plinkp->p_lnext->p_lprev = pheadp;
1335 1335 pcp->p_hprev->p_hnext = pcp->p_hnext;
1336 1336 pcp->p_hnext->p_hprev = pcp->p_hprev;
1337 1337 pcp->p_hprev = delcallb_list;
1338 1338 delcallb_list = pcp;
1339 1339 if (hp->p_hnext == (struct seg_pcache *)hp) {
1340 1340 seg_premove_abuck(hp, 0);
1341 1341 }
1342 1342 mutex_exit(&hp->p_hmutex);
1343 1343 }
1344 1344 mutex_exit(pmtx);
1345 1345 }
1346 1346 while (delcallb_list != NULL) {
1347 1347 pcp = delcallb_list;
1348 1348 delcallb_list = pcp->p_hprev;
1349 1349 ASSERT(!pcp->p_active);
1350 1350 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1351 1351 pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1352 1352 npages += btop(pcp->p_len);
1353 1353 kmem_cache_free(seg_pkmcache, pcp);
1354 1354 }
1355 1355 mutex_enter(&seg_pmem_mtx);
1356 1356 ASSERT(seg_plocked >= npages);
1357 1357 seg_plocked -= npages;
1358 1358 if (!IS_PFLAGS_WIRED(flags)) {
1359 1359 ASSERT(seg_plocked_window >= npages);
1360 1360 seg_plocked_window -= npages;
1361 1361 }
1362 1362 mutex_exit(&seg_pmem_mtx);
1363 1363 }
1364 1364
1365 1365 static void seg_pinit_mem_config(void);
1366 1366
1367 1367 /*
1368 1368 * setup the pagelock cache
1369 1369 */
1370 1370 static void
1371 1371 seg_pinit(void)
1372 1372 {
1373 1373 struct seg_phash *hp;
1374 1374 ulong_t i;
1375 1375 pgcnt_t physmegs;
1376 1376
1377 1377 seg_plocked = 0;
1378 1378 seg_plocked_window = 0;
1379 1379
1380 1380 if (segpcache_enabled == 0) {
1381 1381 seg_phashsize_win = 0;
1382 1382 seg_phashsize_wired = 0;
1383 1383 seg_pdisabled = 1;
1384 1384 return;
1385 1385 }
1386 1386
1387 1387 seg_pdisabled = 0;
1388 1388 seg_pkmcache = kmem_cache_create("seg_pcache",
1389 1389 sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1390 1390 if (segpcache_pcp_maxage_ticks <= 0) {
1391 1391 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1392 1392 }
1393 1393 seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1394 1394 seg_pathr_empty_ahb = 0;
1395 1395 seg_pathr_full_ahb = 0;
1396 1396 seg_pshrink_shift = segpcache_shrink_shift;
1397 1397 seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1398 1398
1399 1399 mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 1400 mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1401 1401 mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1402 1402 cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1403 1403
1404 1404 physmegs = physmem >> (20 - PAGESHIFT);
1405 1405
1406 1406 /*
1407 1407 * If segpcache_hashsize_win was not set in /etc/system or it has
1408 1408 * absurd value set it to a default.
1409 1409 */
1410 1410 if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1411 1411 /*
1412 1412 * Create one bucket per 32K (or at least per 8 pages) of
1413 1413 * available memory.
1414 1414 */
1415 1415 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1416 1416 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1417 1417 }
1418 1418 if (!ISP2(segpcache_hashsize_win)) {
1419 1419 ulong_t rndfac = ~(1UL <<
1420 1420 (highbit(segpcache_hashsize_win) - 1));
1421 1421 rndfac &= segpcache_hashsize_win;
1422 1422 segpcache_hashsize_win += rndfac;
1423 1423 segpcache_hashsize_win = 1 <<
1424 1424 (highbit(segpcache_hashsize_win) - 1);
1425 1425 }
1426 1426 seg_phashsize_win = segpcache_hashsize_win;
1427 1427 seg_phashtab_win = kmem_zalloc(
1428 1428 seg_phashsize_win * sizeof (struct seg_phash),
1429 1429 KM_SLEEP);
1430 1430 for (i = 0; i < seg_phashsize_win; i++) {
1431 1431 hp = &seg_phashtab_win[i];
1432 1432 hp->p_hnext = (struct seg_pcache *)hp;
1433 1433 hp->p_hprev = (struct seg_pcache *)hp;
1434 1434 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1435 1435 }
1436 1436
1437 1437 seg_pahcur = 0;
1438 1438 seg_pathr_on = 0;
1439 1439 seg_pahhead[0].p_lnext = &seg_pahhead[0];
1440 1440 seg_pahhead[0].p_lprev = &seg_pahhead[0];
1441 1441 seg_pahhead[1].p_lnext = &seg_pahhead[1];
1442 1442 seg_pahhead[1].p_lprev = &seg_pahhead[1];
1443 1443
1444 1444 /*
1445 1445 * If segpcache_hashsize_wired was not set in /etc/system or it has
1446 1446 * absurd value set it to a default.
1447 1447 */
1448 1448 if (segpcache_hashsize_wired == 0 ||
1449 1449 segpcache_hashsize_wired > physmem / 4) {
1450 1450 /*
1451 1451 * Choose segpcache_hashsize_wired based on physmem.
1452 1452 * Create a bucket per 128K bytes, up to 256K buckets.
1453 1453 */
1454 1454 if (physmegs < 20 * 1024) {
1455 1455 segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1456 1456 } else {
1457 1457 segpcache_hashsize_wired = 256 * 1024;
1458 1458 }
1459 1459 }
1460 1460 if (!ISP2(segpcache_hashsize_wired)) {
1461 1461 segpcache_hashsize_wired = 1 <<
1462 1462 highbit(segpcache_hashsize_wired);
1463 1463 }
1464 1464 seg_phashsize_wired = segpcache_hashsize_wired;
1465 1465 seg_phashtab_wired = kmem_zalloc(
1466 1466 seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1467 1467 for (i = 0; i < seg_phashsize_wired; i++) {
1468 1468 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1469 1469 hp->p_hnext = (struct seg_pcache *)hp;
1470 1470 hp->p_hprev = (struct seg_pcache *)hp;
1471 1471 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1472 1472 }
1473 1473
1474 1474 if (segpcache_maxwindow == 0) {
1475 1475 if (physmegs < 64) {
1476 1476 /* 3% of memory */
1477 1477 segpcache_maxwindow = availrmem >> 5;
1478 1478 } else if (physmegs < 512) {
1479 1479 /* 12% of memory */
1480 1480 segpcache_maxwindow = availrmem >> 3;
1481 1481 } else if (physmegs < 1024) {
1482 1482 /* 25% of memory */
1483 1483 segpcache_maxwindow = availrmem >> 2;
1484 1484 } else if (physmegs < 2048) {
1485 1485 /* 50% of memory */
1486 1486 segpcache_maxwindow = availrmem >> 1;
1487 1487 } else {
1488 1488 /* no limit */
1489 1489 segpcache_maxwindow = (pgcnt_t)-1;
1490 1490 }
1491 1491 }
1492 1492 seg_pmaxwindow = segpcache_maxwindow;
1493 1493 seg_pinit_mem_config();
1494 1494 }
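
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): assuming
 * 4K pages and 1 GB of memory (physmem = 256K pages, physmegs = 1024),
 * the defaults above work out to roughly:
 *
 *        pages_per_bucket         = MAX(btop(32K), 8)        = 8
 *        segpcache_hashsize_win   = MAX(1024, 256K / 8)      = 32768
 *        segpcache_hashsize_wired = MAX(1024, physmegs << 3) = 8192
 *        segpcache_maxwindow      = availrmem >> 1   (physmegs < 2048)
 *
 * The page size and memory size are assumptions picked only to make
 * the arithmetic concrete.
 */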
1495 1495
1496 1496 /*
1497 1497 * called by pageout if memory is low
1498 1498 */
1499 1499 void
1500 1500 seg_preap(void)
1501 1501 {
1502 1502 /*
1503 1503 * if the cache is off or empty, return
1504 1504 */
1505 1505 if (seg_plocked_window == 0) {
1506 1506 return;
1507 1507 }
1508 1508 ASSERT(seg_phashsize_win != 0);
1509 1509
1510 1510 /*
1511 1511 * If somebody is already purging pcache
1512 1512 * just return.
1513 1513 */
1514 1514 if (seg_pdisabled) {
1515 1515 return;
1516 1516 }
1517 1517
1518 1518 cv_signal(&seg_pasync_cv);
1519 1519 }
1520 1520
1521 1521 /*
1522 1522 * run as a background thread and reclaim pagelock
1523 1523 * pages which have not been used recently
1524 1524 */
1525 1525 void
1526 1526 seg_pasync_thread(void)
1527 1527 {
1528 1528 callb_cpr_t cpr_info;
1529 1529
1530 1530 if (seg_phashsize_win == 0) {
1531 1531 thread_exit();
1532 1532 /*NOTREACHED*/
1533 1533 }
1534 1534
1535 1535 seg_pasync_thr = curthread;
1536 1536
1537 1537 CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1538 1538 callb_generic_cpr, "seg_pasync");
1539 1539
1540 1540 if (segpcache_reap_ticks <= 0) {
1541 1541 segpcache_reap_ticks = segpcache_reap_sec * hz;
1542 1542 }
1543 1543
1544 1544 mutex_enter(&seg_pasync_mtx);
1545 1545 for (;;) {
1546 1546 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1547 1547 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1548 1548 segpcache_reap_ticks, TR_CLOCK_TICK);
1549 1549 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1550 1550 if (seg_pdisabled == 0) {
1551 1551 seg_ppurge_async(0);
1552 1552 }
1553 1553 }
1554 1554 }
1555 1555
1556 1556 static struct kmem_cache *seg_cache;
1557 1557
1558 1558 /*
1559 1559 * Initialize segment management data structures.
1560 1560 */
1561 1561 void
1562 1562 seg_init(void)
1563 1563 {
1564 1564 kstat_t *ksp;
1565 1565
1566 1566 seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1567 1567 0, NULL, NULL, NULL, NULL, NULL, 0);
1568 1568
1569 1569 ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1570 1570 segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1571 1571 if (ksp) {
1572 1572 ksp->ks_data = (void *)segadvstat_ptr;
1573 1573 kstat_install(ksp);
1574 1574 }
1575 1575
1576 1576 seg_pinit();
1577 1577 }
1578 1578
1579 1579 /*
1580 1580 * Allocate a segment to cover [base, base+size]
1581 1581 * and attach it to the specified address space.
1582 1582 */
1583 1583 struct seg *
1584 1584 seg_alloc(struct as *as, caddr_t base, size_t size)
1585 1585 {
1586 1586 struct seg *new;
1587 1587 caddr_t segbase;
1588 1588 size_t segsize;
1589 1589
1590 1590 segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1591 1591 segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1592 1592 (uintptr_t)segbase;
1593 1593
1594 1594 if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1595 1595 return ((struct seg *)NULL); /* bad virtual addr range */
1596 1596
1597 1597 if (as != &kas &&
1598 1598 valid_usr_range(segbase, segsize, 0, as,
1599 1599 as->a_userlimit) != RANGE_OKAY)
1600 1600 return ((struct seg *)NULL); /* bad virtual addr range */
1601 1601
1602 1602 new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1603 1603 new->s_ops = NULL;
1604 1604 new->s_data = NULL;
1605 1605 new->s_szc = 0;
1606 1606 new->s_flags = 0;
1607 1607 mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1608 1608 new->s_phead.p_lnext = &new->s_phead;
1609 1609 new->s_phead.p_lprev = &new->s_phead;
1610 1610 if (seg_attach(as, segbase, segsize, new) < 0) {
1611 1611 kmem_cache_free(seg_cache, new);
1612 1612 return ((struct seg *)NULL);
1613 1613 }
1614 1614 /* caller must fill in ops, data */
1615 1615 return (new);
1616 1616 }
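
/*
 * Illustrative sketch (editorial note, not part of vm_seg.c): the usual
 * pattern, as in the as_map() path, is to allocate the segment and then
 * hand it to a segment driver's create routine, which fills in s_ops
 * and s_data:
 *
 *        seg = seg_alloc(as, addr, len);
 *        if (seg == NULL)
 *                return (ENOMEM);
 *        error = (*crfp)(seg, argsp);
 *        if (error != 0)
 *                seg_free(seg);
 *
 * The address space lock must be held as writer across this; crfp and
 * argsp stand in for the driver create function (e.g. segvn_create)
 * and its argument structure.
 */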
1617 1617
1618 1618 /*
1619 1619 * Attach a segment to the address space. Used by seg_alloc()
1620 1620 * and for kernel startup to attach to static segments.
1621 1621 */
1622 1622 int
1623 1623 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1624 1624 {
1625 1625 seg->s_as = as;
1626 1626 seg->s_base = base;
1627 1627 seg->s_size = size;
1628 1628
1629 1629 /*
1630 1630 * as_addseg() will add the segment at the appropriate point
1631 1631 * in the list. It will return -1 if there is overlap with
1632 1632 * an already existing segment.
1633 1633 */
1634 1634 return (as_addseg(as, seg));
1635 1635 }
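
A hedged sketch of the other use the comment mentions, attaching a statically allocated segment to kas during kernel startup. my_static_seg, my_static_segops and my_startup_attach() are illustrative names, and the AS_LOCK_ENTER form shown assumes the same a_lock conventions used by the assertions elsewhere in this file.

static struct seg my_static_seg;
extern struct seg_ops my_static_segops;

void
my_startup_attach(caddr_t base, size_t size)
{
	AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER);
	if (seg_attach(&kas, base, size, &my_static_seg) < 0)
		panic("my_startup_attach: seg_attach failed");
	my_static_seg.s_ops = &my_static_segops;	/* caller still fills in ops */
	AS_LOCK_EXIT(&kas, &kas.a_lock);
}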
1636 1636
1637 1637 /*
1638 1638 * Unmap a segment and free it from its associated address space.
1639 1639 * This should be called by anybody who's finished with a whole segment's
1640 1640 	 * mapping. Just calls segop_unmap() on the whole mapping. It is the
1641 1641 	 * responsibility of the segment driver to unlink the segment
1642 1642 * from the address space, and to free public and private data structures
1643 1643 * associated with the segment. (This is typically done by a call to
1644 1644 * seg_free()).
1645 1645 */
1646 1646 void
1647 1647 seg_unmap(struct seg *seg)
1648 1648 {
1649 1649 #ifdef DEBUG
1650 1650 int ret;
1651 1651 #endif /* DEBUG */
1652 1652
1653 1653 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1654 1654
1655 1655 /* Shouldn't have called seg_unmap if mapping isn't yet established */
1656 1656 ASSERT(seg->s_data != NULL);
1657 1657
1658 1658 /* Unmap the whole mapping */
1659 1659 #ifdef DEBUG
1660 1660 ret = segop_unmap(seg, seg->s_base, seg->s_size);
1661 1661 ASSERT(ret == 0);
1662 1662 #else
1663 1663 (void) segop_unmap(seg, seg->s_base, seg->s_size);
1664 1664 #endif /* DEBUG */
1665 1665 }
1666 1666
1667 1667 /*
1668 1668 * Free the segment from its associated as. This should only be called
1669 1669 * if a mapping to the segment has not yet been established (e.g., if
1670 1670 * an error occurs in the middle of doing an as_map when the segment
1671 1671 * has already been partially set up) or if it has already been deleted
1672 1672 * (e.g., from a segment driver unmap routine if the unmap applies to the
1673 1673 * entire segment). If the mapping is currently set up then seg_unmap() should
1674 1674 * be called instead.
1675 1675 */
1676 1676 void
1677 1677 seg_free(struct seg *seg)
1678 1678 {
1679 1679 register struct as *as = seg->s_as;
1680 1680 struct seg *tseg = as_removeseg(as, seg);
1681 1681
1682 1682 ASSERT(tseg == seg);
1683 1683
1684 1684 /*
1685 1685 * If the segment private data field is NULL,
1686 1686 * then segment driver is not attached yet.
1687 1687 */
1688 1688 if (seg->s_data != NULL)
1689 1689 segop_free(seg);
1690 1690
1691 1691 mutex_destroy(&seg->s_pmtx);
1692 1692 ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1693 1693 ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1694 1694 kmem_cache_free(seg_cache, seg);
1695 1695 }
1696 1696
1697 1697 /*ARGSUSED*/
1698 1698 static void
1699 1699 seg_p_mem_config_post_add(
1700 1700 void *arg,
1701 1701 pgcnt_t delta_pages)
1702 1702 {
1703 1703 /* Nothing to do. */
1704 1704 }
1705 1705
1706 1706 void
1707 1707 seg_p_enable(void)
1708 1708 {
1709 1709 mutex_enter(&seg_pcache_mtx);
1710 1710 ASSERT(seg_pdisabled != 0);
1711 1711 seg_pdisabled--;
1712 1712 mutex_exit(&seg_pcache_mtx);
1713 1713 }
1714 1714
1715 1715 /*
1716 1716 * seg_p_disable - disables seg_pcache, and then attempts to empty the
1717 1717 * cache.
1718 1718 * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1719 1719 * SEGP_FAIL if the cache could not be emptied.
1720 1720 */
1721 1721 int
1722 1722 seg_p_disable(void)
1723 1723 {
1724 1724 pgcnt_t old_plocked;
1725 1725 int stall_count = 0;
1726 1726
1727 1727 mutex_enter(&seg_pcache_mtx);
1728 1728 seg_pdisabled++;
1729 1729 ASSERT(seg_pdisabled != 0);
1730 1730 mutex_exit(&seg_pcache_mtx);
1731 1731
1732 1732 /*
1733 1733 * Attempt to empty the cache. Terminate if seg_plocked does not
1734 1734 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1735 1735 */
1736 1736 while (seg_plocked != 0) {
1737 1737 ASSERT(seg_phashsize_win != 0);
1738 1738 old_plocked = seg_plocked;
1739 1739 seg_ppurge_async(1);
1740 1740 if (seg_plocked == old_plocked) {
1741 1741 if (stall_count++ > SEGP_STALL_THRESHOLD) {
1742 1742 return (SEGP_FAIL);
1743 1743 }
1744 1744 } else
1745 1745 stall_count = 0;
1746 1746 if (seg_plocked != 0)
1747 1747 delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1748 1748 }
1749 1749 return (SEGP_SUCCESS);
1750 1750 }
1751 1751
1752 1752 /*
1753 1753 * Attempt to purge seg_pcache. May need to return before this has
1754 1754 * completed to allow other pre_del callbacks to unlock pages. This is
1755 1755 * ok because:
1756 1756 * 1) The seg_pdisabled flag has been set so at least we won't
1757 1757 	 * cache any more locks and the locks we couldn't purge
1758 1758 * will not be held if they do get released by a subsequent
1759 1759 * pre-delete callback.
1760 1760 *
1761 1761 * 2) The rest of the memory delete thread processing does not
1762 1762 * depend on the changes made in this pre-delete callback. No
1763 1763 * panics will result, the worst that will happen is that the
1764 1764 * DR code will timeout and cancel the delete.
1765 1765 */
1766 1766 /*ARGSUSED*/
1767 1767 static int
1768 1768 seg_p_mem_config_pre_del(
1769 1769 void *arg,
1770 1770 pgcnt_t delta_pages)
1771 1771 {
1772 1772 if (seg_phashsize_win == 0) {
1773 1773 return (0);
1774 1774 }
1775 1775 if (seg_p_disable() != SEGP_SUCCESS)
1776 1776 cmn_err(CE_NOTE,
1777 1777 "!Pre-delete couldn't purge"" pagelock cache - continuing");
1778 1778 return (0);
1779 1779 }
1780 1780
1781 1781 /*ARGSUSED*/
1782 1782 static void
1783 1783 seg_p_mem_config_post_del(
1784 1784 void *arg,
1785 1785 pgcnt_t delta_pages,
1786 1786 int cancelled)
1787 1787 {
1788 1788 if (seg_phashsize_win == 0) {
1789 1789 return;
1790 1790 }
1791 1791 seg_p_enable();
1792 1792 }
1793 1793
1794 1794 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1795 1795 KPHYSM_SETUP_VECTOR_VERSION,
1796 1796 seg_p_mem_config_post_add,
1797 1797 seg_p_mem_config_pre_del,
1798 1798 seg_p_mem_config_post_del,
1799 1799 };
1800 1800
1801 1801 static void
1802 1802 seg_pinit_mem_config(void)
1803 1803 {
1804 1804 int ret;
1805 1805
1806 1806 ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1807 1807 /*
1808 1808 * Want to catch this in the debug kernel. At run time, if the
1809 1809 * callbacks don't get run all will be OK as the disable just makes
1810 1810 * it more likely that the pages can be collected.
1811 1811 */
1812 1812 ASSERT(ret == 0);
1813 1813 }
1814 1814
1815 1815 /*
1816 1816 * Verify that segment is not a shared anonymous segment which reserves
1817 1817 	 * swap. zone.max-swap accounting (zone->zone_max_swap) cannot be transferred
1818 1818 * from one zone to another if any segments are shared. This is because the
1819 1819 * last process to exit will credit the swap reservation. This could lead
1820 1820 * to the swap being reserved by one zone, and credited to another.
1821 1821 */
1822 1822 boolean_t
1823 1823 seg_can_change_zones(struct seg *seg)
1824 1824 {
1825 1825 struct segvn_data *svd;
1826 1826
1827 1827 if (seg->s_ops == &segspt_shmops)
1828 1828 return (B_FALSE);
1829 1829
1830 1830 if (seg->s_ops == &segvn_ops) {
1831 1831 svd = (struct segvn_data *)seg->s_data;
1832 1832 if (svd->type == MAP_SHARED &&
1833 1833 svd->amp != NULL &&
1834 1834 svd->amp->swresv > 0)
1835 1835 return (B_FALSE);
1836 1836 }
1837 1837 return (B_TRUE);
1838 1838 }
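
For context, a hedged sketch of how a caller might apply seg_can_change_zones() across a whole address space before rebinding it to another zone. This mirrors what the address-space layer does, but the function name and the AS_SEGFIRST/AS_SEGNEXT iteration shown here are assumptions for illustration, and a reader hold on a_lock is assumed.

static boolean_t
my_as_can_change_zones(struct as *as)
{
	struct seg *seg;
	boolean_t ret = B_TRUE;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		if (!seg_can_change_zones(seg)) {
			ret = B_FALSE;	/* shared, swap-reserving segment found */
			break;
		}
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (ret);
}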
1839 1839
1840 1840 /*
1841 1841 * Return swap reserved by a segment backing a private mapping.
1842 1842 */
1843 1843 size_t
1844 1844 seg_swresv(struct seg *seg)
1845 1845 {
1846 1846 struct segvn_data *svd;
1847 1847 size_t swap = 0;
1848 1848
1849 1849 if (seg->s_ops == &segvn_ops) {
1850 1850 svd = (struct segvn_data *)seg->s_data;
1851 1851 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1852 1852 swap = svd->swresv;
1853 1853 }
1854 1854 return (swap);
1855 1855 }
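
Similarly, a per-address-space swap total can be built by summing seg_swresv() over every segment. A minimal sketch, again assuming the AS_SEGFIRST/AS_SEGNEXT iterators and a reader hold on a_lock; my_as_swresv() is an illustrative name.

static size_t
my_as_swresv(struct as *as)
{
	struct seg *seg;
	size_t total = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
		total += seg_swresv(seg);	/* only private segvn mappings count */
	AS_LOCK_EXIT(as, &as->a_lock);
	return (total);
}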
1856 1856
1857 1857 /*
1858 - * General not supported function for segop_inherit
1859 - */
1860 -/* ARGSUSED */
1861 -int
1862 -seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1863 -{
1864 - return (ENOTSUP);
1865 -}
1866 -
1867 -/*
1868 1858 * segop wrappers
1869 1859 */
1870 1860 int
1871 1861 segop_dup(struct seg *seg, struct seg *new)
1872 1862 {
1873 1863 return (seg->s_ops->dup(seg, new));
1874 1864 }
1875 1865
1876 1866 int
1877 1867 segop_unmap(struct seg *seg, caddr_t addr, size_t len)
1878 1868 {
1879 1869 return (seg->s_ops->unmap(seg, addr, len));
1880 1870 }
1881 1871
1882 1872 void
1883 1873 segop_free(struct seg *seg)
1884 1874 {
1885 1875 seg->s_ops->free(seg);
1886 1876 }
1887 1877
1888 1878 faultcode_t
1889 1879 segop_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
1890 1880 enum fault_type type, enum seg_rw rw)
1891 1881 {
1892 1882 return (seg->s_ops->fault(hat, seg, addr, len, type, rw));
1893 1883 }
1894 1884
1895 1885 faultcode_t
1896 1886 segop_faulta(struct seg *seg, caddr_t addr)
1897 1887 {
1898 1888 return (seg->s_ops->faulta(seg, addr));
1899 1889 }
1900 1890
1901 1891 int
1902 1892 segop_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1903 1893 {
1904 1894 return (seg->s_ops->setprot(seg, addr, len, prot));
1905 1895 }
1906 1896
1907 1897 int
1908 1898 segop_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1909 1899 {
1910 1900 return (seg->s_ops->checkprot(seg, addr, len, prot));
1911 1901 }
1912 1902
1913 1903 int
1914 1904 segop_kluster(struct seg *seg, caddr_t addr, ssize_t d)
1915 1905 {
1916 1906 return (seg->s_ops->kluster(seg, addr, d));
1917 1907 }
1918 1908
1919 1909 size_t
1920 1910 segop_swapout(struct seg *seg)
1921 1911 {
1922 1912 return (seg->s_ops->swapout(seg));
1923 1913 }
1924 1914
1925 1915 int
1926 1916 segop_sync(struct seg *seg, caddr_t addr, size_t len, int atr, uint_t f)
1927 1917 {
1928 1918 return (seg->s_ops->sync(seg, addr, len, atr, f));
1929 1919 }
1930 1920
1931 1921 size_t
1932 1922 segop_incore(struct seg *seg, caddr_t addr, size_t len, char *v)
1933 1923 {
1934 1924 return (seg->s_ops->incore(seg, addr, len, v));
1935 1925 }
1936 1926
1937 1927 int
1938 1928 segop_lockop(struct seg *seg, caddr_t addr, size_t len, int atr, int op,
1939 1929 ulong_t *b, size_t p)
1940 1930 {
1941 1931 return (seg->s_ops->lockop(seg, addr, len, atr, op, b, p));
1942 1932 }
1943 1933
1944 1934 int
1945 1935 segop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *p)
1946 1936 {
1947 1937 return (seg->s_ops->getprot(seg, addr, len, p));
1948 1938 }
1949 1939
1950 1940 u_offset_t
1951 1941 segop_getoffset(struct seg *seg, caddr_t addr)
1952 1942 {
1953 1943 return (seg->s_ops->getoffset(seg, addr));
1954 1944 }
1955 1945
1956 1946 int
1957 1947 segop_gettype(struct seg *seg, caddr_t addr)
1958 1948 {
1959 1949 return (seg->s_ops->gettype(seg, addr));
1960 1950 }
1961 1951
1962 1952 int
1963 1953 segop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
1964 1954 {
1965 1955 return (seg->s_ops->getvp(seg, addr, vpp));
1966 1956 }
1967 1957
1968 1958 int
1969 1959 segop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t b)
1970 1960 {
1971 1961 return (seg->s_ops->advise(seg, addr, len, b));
1972 1962 }
1973 1963
1974 1964 void
1975 1965 segop_dump(struct seg *seg)
1976 1966 {
1977 1967 seg->s_ops->dump(seg);
1978 1968 }
1979 1969
1980 1970 int
1981 1971 segop_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***page,
1982 1972 enum lock_type type, enum seg_rw rw)
1983 1973 {
1984 1974 return (seg->s_ops->pagelock(seg, addr, len, page, type, rw));
1985 1975 }
1986 1976
1987 1977 int
1988 1978 segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
1989 1979 {
1990 1980 return (seg->s_ops->setpagesize(seg, addr, len, szc));
1991 1981 }
1992 1982
1993 1983 int
1994 1984 segop_getmemid(struct seg *seg, caddr_t addr, memid_t *mp)
1995 1985 {
1996 1986 return (seg->s_ops->getmemid(seg, addr, mp));
1997 1987 }
1998 1988
1999 1989 struct lgrp_mem_policy_info *
2000 1990 segop_getpolicy(struct seg *seg, caddr_t addr)
2001 1991 {
2002 1992 if (seg->s_ops->getpolicy == NULL)
2003 1993 return (NULL);
2004 1994
2005 1995 return (seg->s_ops->getpolicy(seg, addr));
2006 1996 }
2007 1997
2008 1998 int
2009 1999 segop_capable(struct seg *seg, segcapability_t cap)
2010 2000 {
2011 2001 return (seg->s_ops->capable(seg, cap));
2012 2002 }
2013 2003
2014 2004 int
2015 2005 segop_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t op)
2016 2006 {
2017 2007 if (seg->s_ops->inherit == NULL)
2018 2008 return (ENOTSUP);
2019 2009
2020 2010 return (seg->s_ops->inherit(seg, addr, len, op));
2021 2011 }
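
This NULL check is what makes the deleted seg_inherit_notsup() stub redundant: a driver that does not support inheritance can simply leave the inherit slot unset, and segop_inherit() returns ENOTSUP on its behalf. A minimal sketch, with my_segops and the my_* entry points as illustrative names and most mandatory ops omitted for brevity:

static int my_dup(struct seg *, struct seg *);
static int my_unmap(struct seg *, caddr_t, size_t);
static void my_free(struct seg *);

static struct seg_ops my_segops = {
	.dup	= my_dup,
	.unmap	= my_unmap,
	.free	= my_free,
	/* ...remaining ops elided for brevity... */
	/* .inherit left NULL: previously pointed at seg_inherit_notsup */
};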