Print this page
4823 don't open-code NSEC2MSEC and MSEC2NSEC
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/rcap/rcapd/rcapd_main.c
+++ new/usr/src/cmd/rcap/rcapd/rcapd_main.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * rcapd is a long-running daemon enforcing project-based resource caps (see
28 28 * rcapd(1M)). Each instance of a process aggregate (project or, generically,
29 29 * "collection") may have a memory cap. A single thread monitors the resource
30 30 * utilization of capped collections, enforces caps when they are exceeded (and
31 31 * other conditions are met), and incorporates changes in configuration or
32 32 * caps. Each of these actions occurs not more frequently than the rate
33 33 * specified with rcapadm(1M).
34 34 */
35 35
36 36 #include <sys/priocntl.h>
37 37 #include <sys/proc.h>
38 38 #include <sys/resource.h>
39 39 #include <sys/sysinfo.h>
40 40 #include <sys/stat.h>
41 41 #include <sys/sysmacros.h>
42 42 #include <sys/time.h>
43 43 #include <sys/types.h>
44 44 #include <dirent.h>
45 45 #include <errno.h>
46 46 #include <fcntl.h>
47 47 #include <kstat.h>
48 48 #include <libintl.h>
49 49 #include <limits.h>
50 50 #include <locale.h>
51 51 #include <priv.h>
52 52 #include <signal.h>
53 53 #include <stdarg.h>
54 54 #include <stdio.h>
55 55 #include <stdio_ext.h>
56 56 #include <stdlib.h>
57 57 #include <libscf.h>
58 58 #include <strings.h>
59 59 #include <time.h>
60 60 #include <unistd.h>
61 61 #include <zone.h>
62 62 #include <assert.h>
63 63 #include <sys/vm_usage.h>
64 64 #include "rcapd.h"
65 65 #include "rcapd_mapping.h"
66 66 #include "rcapd_rfd.h"
67 67 #include "rcapd_stat.h"
68 68 #include "utils.h"
69 69
/*
 * Smaller of two values, where any value <= 0 means "unset": if one operand
 * is unset, the other one is the result.  Operands are evaluated more than
 * once.
 */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN((x), (y)))

/*
 * Absolute time of an event "seconds" after "base", or 0 if the interval is
 * not positive.  Both arguments are parenthesized so that expression
 * arguments are cast and multiplied as a whole.
 */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)(seconds) > 0) ? \
	((base) + (hrtime_t)(seconds) * (hrtime_t)NANOSEC) : (hrtime_t)0)

/*
 * As NEXT_EVENT_TIME(), but 0 when no statistics file is configured.  Note
 * that "base" is not used; the event time is measured from gethrtime().
 */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ? \
	NEXT_EVENT_TIME(gethrtime(), seconds) : (hrtime_t)0)

/* True when "eventtime" is set (nonzero) and "time" is past it. */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)

#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */

/* Bits describing which kinds of capped collections exist. */
#define	CAPPED_PROJECT	0x01
#define	CAPPED_ZONE	0x02
85 85
86 86 typedef struct soft_scan_arg {
87 87 uint64_t ssa_sum_excess;
88 88 int64_t ssa_scan_goal;
89 89 boolean_t ssa_project_over_cap;
90 90 } soft_scan_arg_t;
91 91
92 92 typedef struct sample_col_arg {
93 93 boolean_t sca_any_over_cap;
94 94 boolean_t sca_project_over_cap;
95 95 } sample_col_arg_t;
96 96
97 97
98 98 static int debug_mode = 0; /* debug mode flag */
99 99 static pid_t rcapd_pid; /* rcapd's pid to ensure it's not */
100 100 /* scanned */
101 101 static kstat_ctl_t *kctl; /* kstat chain */
102 102 static int memory_pressure = 0; /* physical memory utilization (%) */
103 103 static int memory_pressure_sample = 0; /* count of samples */
104 104 static long page_size_kb = 0; /* system page size in KB */
105 105 static size_t nvmu_vals = 0; /* # of kernel RSS/swap vals in array */
106 106 static size_t vmu_vals_len = 0; /* size of RSS/swap vals array */
107 107 static vmusage_t *vmu_vals = NULL; /* snapshot of kernel RSS/swap values */
108 108 static hrtime_t next_report; /* time of next report */
109 109 static int termination_signal = 0; /* terminating signal */
110 110 static zoneid_t my_zoneid = (zoneid_t)-1;
111 111 static lcollection_t *gz_col; /* global zone collection */
112 112
113 113 rcfg_t rcfg;
114 114 /*
115 115 * Updated when we re-read the collection configurations if this rcapd instance
116 116 * is running in the global zone and the global zone is capped.
117 117 */
118 118 boolean_t gz_capped = B_FALSE;
119 119
120 120 /*
121 121 * Flags.
122 122 */
123 123 static int ever_ran;
124 124 int should_run;
125 125 static int should_reconfigure;
126 126
127 127 static int verify_statistics(void);
128 128 static int update_statistics(void);
129 129
130 130 /*
131 131 * Checks if a process is marked 'system'. Returns FALSE only when it is not.
132 132 */
133 133 static boolean_t
134 134 proc_issystem(pid_t pid)
135 135 {
136 136 char pc_clname[PC_CLNMSZ];
137 137
138 138 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
139 139 PC_KY_NULL) != -1) {
140 140 return (strcmp(pc_clname, "SYS") == 0);
141 141 } else {
142 142 debug("cannot get class-specific scheduling parameters; "
143 143 "assuming system process\n");
144 144 return (B_TRUE);
145 145 }
146 146 }
147 147
148 148 static void
149 149 lprocess_insert_mark(psinfo_t *psinfop)
150 150 {
151 151 pid_t pid = psinfop->pr_pid;
152 152 /* flag indicating whether the process should be scanned. */
153 153 int unscannable = psinfop->pr_nlwp == 0;
154 154 rcid_t colid;
155 155 lcollection_t *lcol;
156 156 lprocess_t *lproc;
157 157
158 158 /*
159 159 * Determine which collection to put this process into. We only have
160 160 * to worry about tracking both zone and project capped processes if
161 161 * this rcapd instance is running in the global zone, since we'll only
162 162 * see processes in our own projects in a non-global zone. In the
163 163 * global zone, if the process belongs to a non-global zone, we only
164 164 * need to track it for the capped non-global zone collection. For
165 165 * global zone processes, we first attempt to put the process into a
166 166 * capped project collection. On the second pass into this function
167 167 * the projid will be cleared so we will just track the process for the
168 168 * global zone collection as a whole.
169 169 */
170 170 if (psinfop->pr_zoneid == my_zoneid && psinfop->pr_projid != -1) {
171 171 colid.rcid_type = RCIDT_PROJECT;
172 172 colid.rcid_val = psinfop->pr_projid;
173 173 } else {
174 174 /* try to add to zone collection */
175 175 colid.rcid_type = RCIDT_ZONE;
176 176 colid.rcid_val = psinfop->pr_zoneid;
177 177 }
178 178
179 179 if ((lcol = lcollection_find(&colid)) == NULL)
180 180 return;
181 181
182 182 /*
183 183 * If the process is already being tracked, update the unscannable flag,
184 184 * as determined by the caller, from the process's psinfo.
185 185 */
186 186 lproc = lcol->lcol_lprocess;
187 187 while (lproc != NULL) {
188 188 if (lproc->lpc_pid == pid) {
189 189 lproc->lpc_mark = 1;
190 190 if (unscannable != 0 && lproc->lpc_unscannable == 0) {
191 191 debug("process %d: became unscannable\n",
192 192 (int)lproc->lpc_pid);
193 193 lproc->lpc_unscannable = 1;
194 194 }
195 195 return;
196 196 }
197 197 lproc = lproc->lpc_next;
198 198 }
199 199
200 200 /*
201 201 * We've fallen off the list without finding our current process;
202 202 * insert it at the list head.
203 203 */
204 204 if ((lproc = malloc(sizeof (*lproc))) == NULL)
205 205 debug("insufficient memory to track new process %d", (int)pid);
206 206 else {
207 207 (void) bzero(lproc, sizeof (*lproc));
208 208 lproc->lpc_pid = pid;
209 209 lproc->lpc_mark = 1;
210 210 lproc->lpc_collection = lcol;
211 211 lproc->lpc_psinfo_fd = -1;
212 212 lproc->lpc_pgdata_fd = -1;
213 213 lproc->lpc_xmap_fd = -1;
214 214
215 215 /*
216 216 * If the caller didn't flag this process as unscannable
217 217 * already, do some more checking.
218 218 */
219 219 lproc->lpc_unscannable = unscannable || proc_issystem(pid);
220 220
221 221 #ifdef DEBUG
222 222 /*
223 223 * Verify the sanity of lprocess. It should not contain the
224 224 * process we are about to prepend.
225 225 */
226 226 if (lcollection_member(lcol, lproc)) {
227 227 lprocess_t *cur = lcol->lcol_lprocess;
228 228 debug("The collection %lld already has these members, "
229 229 "including me, %d!\n",
230 230 (long long)lcol->lcol_id.rcid_val,
231 231 (int)lproc->lpc_pid);
232 232 while (cur != NULL) {
233 233 debug("\t%d\n", (int)cur->lpc_pid);
234 234 cur = cur->lpc_next;
235 235 }
236 236 info(gettext("process already on lprocess\n"));
237 237 abort();
238 238 }
239 239 #endif /* DEBUG */
240 240 lproc->lpc_next = lcol->lcol_lprocess;
241 241 if (lproc->lpc_next != NULL)
242 242 lproc->lpc_next->lpc_prev = lproc;
243 243 lproc->lpc_prev = NULL;
244 244 lcol->lcol_lprocess = lproc;
245 245
246 246 debug("tracking %s %ld %d %s%s\n",
247 247 (colid.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
248 248 (long)colid.rcid_val,
249 249 (int)pid, psinfop->pr_psargs,
250 250 (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
251 251 lcol->lcol_stat.lcols_proc_in++;
252 252 }
253 253 }
254 254
255 255 static int
256 256 list_walk_process_cb(lcollection_t *lcol, void *arg)
257 257 {
258 258 int (*cb)(lcollection_t *, lprocess_t *) =
259 259 (int(*)(lcollection_t *, lprocess_t *))arg;
260 260 lprocess_t *member;
261 261 lprocess_t *next;
262 262
263 263 member = lcol->lcol_lprocess;
264 264 while (member != NULL) {
265 265 pid_t pid = member->lpc_pid;
266 266 next = member->lpc_next;
267 267
268 268 debug_high("list_walk_all lpc %d\n", (int)pid);
269 269 if (cb(lcol, member) != 0) {
270 270 debug_high("list_walk_all aborted at lpc %d\n",
271 271 (int)pid);
272 272 return (1);
273 273 }
274 274 member = next;
275 275 }
276 276
277 277 return (0);
278 278 }
279 279
280 280 /*
281 281 * Invoke the given callback for each process in each collection. Callbacks
282 282 * are allowed to change the linkage of the process on which they act.
283 283 */
284 284 static void
285 285 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
286 286 {
287 287 list_walk_collection(list_walk_process_cb, (void *)cb);
288 288 }
289 289
290 290 static void
291 291 revoke_psinfo(rfd_t *rfd)
292 292 {
293 293 lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
294 294
295 295 if (lpc != NULL) {
296 296 debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
297 297 ASSERT(lpc->lpc_psinfo_fd != -1);
298 298 lpc->lpc_psinfo_fd = -1;
299 299 } else
300 300 debug("revoking psinfo fd for unknown process\n");
301 301 }
302 302
303 303 /*
304 304 * Retrieve a process's psinfo via an already-opened or new file descriptor.
305 305 * The supplied descriptor will be closed on failure. An optional callback
306 306 * will be invoked with the last descriptor tried, and a supplied callback
307 307 * argument, as its arguments, such that the new descriptor may be cached, or
308 308 * an old one may be invalidated. If the result of the callback is zero, the
309 309 * the caller is to assume responsibility for the file descriptor, to close it
310 310 * with rfd_close().
311 311 *
312 312 * On failure, a nonzero value is returned.
313 313 */
314 314 int
315 315 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
316 316 int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
317 317 {
318 318 int fd;
319 319 int can_try_uncached;
320 320
321 321 ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
322 322
323 323 do {
324 324 if (cached_fd >= 0) {
325 325 fd = cached_fd;
326 326 can_try_uncached = 1;
327 327 debug_high("%d/psinfo, trying cached fd %d\n",
328 328 (int)pid, fd);
329 329 } else {
330 330 char pathbuf[PROC_PATH_MAX];
331 331
332 332 can_try_uncached = 0;
333 333 (void) snprintf(pathbuf, sizeof (pathbuf),
334 334 "/proc/%d/psinfo", (int)pid);
335 335 if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
336 336 revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
337 337 debug("cannot open %s", pathbuf);
338 338 break;
339 339 } else
340 340 debug_high("opened %s, fd %d\n", pathbuf, fd);
341 341 }
342 342
343 343 if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
344 344 sizeof (*psinfo) && psinfo->pr_pid == pid)
345 345 break;
346 346 else {
347 347 debug_high("closed fd %d\n", fd);
348 348 if (rfd_close(fd) != 0)
349 349 debug("could not close fd %d", fd);
350 350 fd = cached_fd = -1;
351 351 }
352 352 } while (can_try_uncached == 1);
353 353
354 354 if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
355 355 if (fd >= 0) {
356 356 debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
357 357 "uncached" : "cached", fd);
358 358 if (rfd_close(fd) != 0)
359 359 debug("could not close fd %d", fd);
360 360 }
361 361
362 362 debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
363 363 fd_update_cb != NULL ? "cached" : "uncached");
364 364 return ((fd >= 0) ? 0 : -1);
365 365 }
366 366
367 367 /*
368 368 * Retrieve the collection membership of all processes and update the psinfo of
369 369 * those non-system, non-zombie ones in collections. For global zone processes,
370 370 * we first attempt to put the process into a capped project collection. We
371 371 * also want to track the process for the global zone collection as a whole.
372 372 */
373 373 static void
374 374 proc_cb(const pid_t pid)
375 375 {
376 376 psinfo_t psinfo;
377 377
378 378 if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0) {
379 379 lprocess_insert_mark(&psinfo);
380 380 if (gz_capped && psinfo.pr_zoneid == GLOBAL_ZONEID) {
381 381 /*
382 382 * We also want to track this process for the global
383 383 * zone as a whole so add it to the global zone
384 384 * collection as well.
385 385 */
386 386 psinfo.pr_projid = -1;
387 387 lprocess_insert_mark(&psinfo);
388 388 }
389 389 }
390 390 }
391 391
392 392 /*
393 393 * Cache the process' psinfo fd, taking responsibility for freeing it.
394 394 */
395 395 int
396 396 lprocess_update_psinfo_fd_cb(void *arg, int fd)
397 397 {
398 398 lprocess_t *lpc = arg;
399 399
400 400 lpc->lpc_psinfo_fd = fd;
401 401 return (0);
402 402 }
403 403
404 404 /*
405 405 * Get the system pagesize.
406 406 */
407 407 static void
408 408 get_page_size(void)
409 409 {
410 410 page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
411 411 debug("physical page size: %luKB\n", page_size_kb);
412 412 }
413 413
414 414 static void
415 415 tm_fmt(char *msg, hrtime_t t1, hrtime_t t2)
416 416 {
417 417 hrtime_t diff = t2 - t1;
418 418
419 419 if (diff < MILLISEC)
420 420 debug("%s: %lld nanoseconds\n", msg, diff);
421 421 else if (diff < MICROSEC)
422 422 debug("%s: %.2f microseconds\n", msg, (float)diff / MILLISEC);
423 423 else if (diff < NANOSEC)
424 424 debug("%s: %.2f milliseconds\n", msg, (float)diff / MICROSEC);
425 425 else
426 426 debug("%s: %.2f seconds\n", msg, (float)diff / NANOSEC);
427 427 }
428 428
429 429 /*
430 430 * Get the zone's & project's RSS from the kernel.
431 431 */
432 432 static void
433 433 rss_sample(boolean_t my_zone_only, uint_t col_types)
434 434 {
435 435 size_t nres;
436 436 size_t i;
437 437 uint_t flags;
438 438 hrtime_t t1, t2;
439 439
440 440 if (my_zone_only) {
441 441 flags = VMUSAGE_ZONE;
442 442 } else {
443 443 flags = 0;
444 444 if (col_types & CAPPED_PROJECT)
445 445 flags |= VMUSAGE_PROJECTS;
446 446 if (col_types & CAPPED_ZONE && my_zoneid == GLOBAL_ZONEID)
447 447 flags |= VMUSAGE_ALL_ZONES;
448 448 }
449 449
450 450 debug("vmusage sample flags 0x%x\n", flags);
451 451 if (flags == 0)
452 452 return;
453 453
454 454 again:
455 455 /* try the current buffer to see if the list will fit */
456 456 nres = vmu_vals_len;
457 457 t1 = gethrtime();
458 458 if (getvmusage(flags, my_zone_only ? 0 : rcfg.rcfg_rss_sample_interval,
459 459 vmu_vals, &nres) != 0) {
460 460 if (errno != EOVERFLOW) {
461 461 warn(gettext("can't read RSS from kernel\n"));
462 462 return;
463 463 }
464 464 }
465 465 t2 = gethrtime();
466 466 tm_fmt("getvmusage time", t1, t2);
467 467
468 468 debug("kernel nres %lu\n", (ulong_t)nres);
469 469
470 470 if (nres > vmu_vals_len) {
471 471 /* array size is now too small, increase it and try again */
472 472 free(vmu_vals);
473 473
474 474 if ((vmu_vals = (vmusage_t *)calloc(nres,
475 475 sizeof (vmusage_t))) == NULL) {
476 476 warn(gettext("out of memory: could not read RSS from "
477 477 "kernel\n"));
478 478 vmu_vals_len = nvmu_vals = 0;
479 479 return;
480 480 }
481 481 vmu_vals_len = nres;
482 482 goto again;
483 483 }
484 484
485 485 nvmu_vals = nres;
486 486
487 487 debug("vmusage_sample\n");
488 488 for (i = 0; i < nvmu_vals; i++) {
489 489 debug("%d: id: %d, type: 0x%x, rss_all: %llu (%lluKB), "
490 490 "swap: %llu\n", (int)i, (int)vmu_vals[i].vmu_id,
491 491 vmu_vals[i].vmu_type,
492 492 (unsigned long long)vmu_vals[i].vmu_rss_all,
493 493 (unsigned long long)vmu_vals[i].vmu_rss_all / 1024,
494 494 (unsigned long long)vmu_vals[i].vmu_swap_all);
495 495 }
496 496 }
497 497
498 498 static void
499 499 update_col_rss(lcollection_t *lcol)
500 500 {
501 501 int i;
502 502
503 503 lcol->lcol_rss = 0;
504 504 lcol->lcol_image_size = 0;
505 505
506 506 for (i = 0; i < nvmu_vals; i++) {
507 507 if (vmu_vals[i].vmu_id != lcol->lcol_id.rcid_val)
508 508 continue;
509 509
510 510 if (vmu_vals[i].vmu_type == VMUSAGE_ZONE &&
511 511 lcol->lcol_id.rcid_type != RCIDT_ZONE)
512 512 continue;
513 513
514 514 if (vmu_vals[i].vmu_type == VMUSAGE_PROJECTS &&
515 515 lcol->lcol_id.rcid_type != RCIDT_PROJECT)
516 516 continue;
517 517
518 518 /* we found the right RSS entry, update the collection vals */
519 519 lcol->lcol_rss = vmu_vals[i].vmu_rss_all / 1024;
520 520 lcol->lcol_image_size = vmu_vals[i].vmu_swap_all / 1024;
521 521 break;
522 522 }
523 523 }
524 524
525 525 /*
526 526 * Sample the collection RSS, updating the collection's statistics with the
527 527 * results. Also, sum the rss of all capped projects & return true if
528 528 * the collection is over cap.
529 529 */
530 530 static int
531 531 rss_sample_col_cb(lcollection_t *lcol, void *arg)
532 532 {
533 533 int64_t excess;
534 534 uint64_t rss;
535 535 sample_col_arg_t *col_argp = (sample_col_arg_t *)arg;
536 536
537 537 update_col_rss(lcol);
538 538
539 539 lcol->lcol_stat.lcols_rss_sample++;
540 540 rss = lcol->lcol_rss;
541 541 excess = rss - lcol->lcol_rss_cap;
542 542 if (excess > 0) {
543 543 lcol->lcol_stat.lcols_rss_act_sum += rss;
544 544 col_argp->sca_any_over_cap = B_TRUE;
545 545 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
546 546 col_argp->sca_project_over_cap = B_TRUE;
547 547 }
548 548 lcol->lcol_stat.lcols_rss_sum += rss;
549 549
550 550 if (lcol->lcol_stat.lcols_min_rss > rss)
551 551 lcol->lcol_stat.lcols_min_rss = rss;
552 552 if (lcol->lcol_stat.lcols_max_rss < rss)
553 553 lcol->lcol_stat.lcols_max_rss = rss;
554 554
555 555 return (0);
556 556 }
557 557
558 558 /*
559 559 * Determine if we have capped projects, capped zones or both.
560 560 */
561 561 static int
562 562 col_type_cb(lcollection_t *lcol, void *arg)
563 563 {
564 564 uint_t *col_type = (uint_t *)arg;
565 565
566 566 /* skip uncapped collections */
567 567 if (lcol->lcol_rss_cap == 0)
568 568 return (1);
569 569
570 570 if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
571 571 *col_type |= CAPPED_PROJECT;
572 572 else
573 573 *col_type |= CAPPED_ZONE;
574 574
575 575 /* once we know everything is capped, we can stop looking */
576 576 if ((*col_type & CAPPED_ZONE) && (*col_type & CAPPED_PROJECT))
577 577 return (1);
578 578
579 579 return (0);
580 580 }
581 581
582 582 /*
583 583 * Open /proc and walk entries.
584 584 */
585 585 static void
586 586 proc_walk_all(void (*cb)(const pid_t))
587 587 {
588 588 DIR *pdir;
589 589 struct dirent *dirent;
590 590 pid_t pid;
591 591
592 592 (void) rfd_reserve(1);
593 593 if ((pdir = opendir("/proc")) == NULL)
594 594 die(gettext("couldn't open /proc!"));
595 595
596 596 while ((dirent = readdir(pdir)) != NULL) {
597 597 if (strcmp(".", dirent->d_name) == 0 ||
598 598 strcmp("..", dirent->d_name) == 0)
599 599 continue;
600 600 pid = atoi(dirent->d_name);
601 601 ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
602 602 if (pid == rcapd_pid)
603 603 continue;
604 604 else
605 605 cb(pid);
606 606 }
607 607 (void) closedir(pdir);
608 608 }
609 609
610 610 /*
611 611 * Clear unmarked callback.
612 612 */
613 613 /*ARGSUSED*/
614 614 static int
615 615 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
616 616 {
617 617 if (lpc->lpc_mark) {
618 618 lpc->lpc_mark = 0;
619 619 } else {
620 620 debug("process %d finished\n", (int)lpc->lpc_pid);
621 621 lprocess_free(lpc);
622 622 }
623 623
624 624 return (0);
625 625 }
626 626
627 627 /*
628 628 * Print, for debugging purposes, a collection's recently-sampled RSS and
629 629 * excess.
630 630 */
631 631 /*ARGSUSED*/
632 632 static int
633 633 excess_print_cb(lcollection_t *lcol, void *arg)
634 634 {
635 635 int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
636 636
637 637 debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
638 638 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
639 639 lcol->lcol_name,
640 640 (unsigned long long)lcol->lcol_rss,
641 641 (unsigned long long)lcol->lcol_rss_cap,
642 642 (long long)excess);
643 643
644 644 return (0);
645 645 }
646 646
647 647 /*
648 648 * Scan those collections which have exceeded their caps.
649 649 *
650 650 * If we're running in the global zone it might have a cap. We don't want to
651 651 * do any capping for the global zone yet since we might get under the cap by
652 652 * just capping the projects in the global zone.
653 653 */
654 654 /*ARGSUSED*/
655 655 static int
656 656 scan_cb(lcollection_t *lcol, void *arg)
657 657 {
658 658 int64_t excess;
659 659
660 660 /* skip over global zone collection for now but keep track for later */
661 661 if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
662 662 lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
663 663 gz_col = lcol;
664 664 return (0);
665 665 }
666 666
667 667 if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
668 668 scan(lcol, excess);
669 669 lcol->lcol_stat.lcols_scan++;
670 670 }
671 671
672 672 return (0);
673 673 }
674 674
675 675 /*
676 676 * Scan the global zone collection and see if it still exceeds its cap.
677 677 * We take into account the effects of capping any global zone projects here.
678 678 */
679 679 static void
680 680 scan_gz(lcollection_t *lcol, boolean_t project_over_cap)
681 681 {
682 682 int64_t excess;
683 683
684 684 /*
685 685 * If we had projects over their cap and the global zone was also over
686 686 * its cap then we need to get the up-to-date global zone rss to
687 687 * determine if we are still over the global zone cap. We might have
688 688 * gone under while we scanned the capped projects. If there were no
689 689 * projects over cap then we can use the rss value we already have for
690 690 * the global zone.
691 691 */
692 692 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
693 693 if (project_over_cap && excess > 0) {
694 694 rss_sample(B_TRUE, CAPPED_ZONE);
695 695 update_col_rss(lcol);
696 696 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
697 697 }
698 698
699 699 if (excess > 0) {
700 700 debug("global zone excess %lldKB\n", (long long)excess);
701 701 scan(lcol, excess);
702 702 lcol->lcol_stat.lcols_scan++;
703 703 }
704 704 }
705 705
706 706 /*
707 707 * Do a soft scan of those collections which have excesses. A soft scan is one
708 708 * in which the cap enforcement pressure is taken into account. The difference
709 709 * between the utilized physical memory and the cap enforcement pressure will
710 710 * be scanned-for, and each collection will be scanned proportionally by their
711 711 * present excesses.
712 712 */
713 713 static int
714 714 soft_scan_cb(lcollection_t *lcol, void *a)
715 715 {
716 716 int64_t excess;
717 717 soft_scan_arg_t *arg = a;
718 718
719 719 /* skip over global zone collection for now but keep track for later */
720 720 if (lcol->lcol_id.rcid_type == RCIDT_ZONE &&
721 721 lcol->lcol_id.rcid_val == GLOBAL_ZONEID) {
722 722 gz_col = lcol;
723 723 return (0);
724 724 }
725 725
726 726 if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
727 727 int64_t adjusted_excess =
728 728 excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
729 729
730 730 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
731 731 "scanning %lld\n",
732 732 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
733 733 "project" : "zone"),
734 734 (long)lcol->lcol_id.rcid_val,
735 735 (long long)excess, (long long)arg->ssa_scan_goal,
736 736 (unsigned long long)arg->ssa_sum_excess,
737 737 (long long)adjusted_excess);
738 738
739 739 scan(lcol, adjusted_excess);
740 740 lcol->lcol_stat.lcols_scan++;
741 741 }
742 742
743 743 return (0);
744 744 }
745 745
746 746 static void
747 747 soft_scan_gz(lcollection_t *lcol, void *a)
748 748 {
749 749 int64_t excess;
750 750 soft_scan_arg_t *arg = a;
751 751
752 752 /*
753 753 * If we had projects over their cap and the global zone was also over
754 754 * its cap then we need to get the up-to-date global zone rss to
755 755 * determine if we are still over the global zone cap. We might have
756 756 * gone under while we scanned the capped projects. If there were no
757 757 * projects over cap then we can use the rss value we already have for
758 758 * the global zone.
759 759 */
760 760 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
761 761 if (arg->ssa_project_over_cap && excess > 0) {
762 762 rss_sample(B_TRUE, CAPPED_ZONE);
763 763 update_col_rss(lcol);
764 764 excess = lcol->lcol_rss - lcol->lcol_rss_cap;
765 765 }
766 766
767 767 if (excess > 0) {
768 768 int64_t adjusted_excess =
769 769 excess * arg->ssa_scan_goal / arg->ssa_sum_excess;
770 770
771 771 debug("%s %ld excess %lld scan_goal %lld sum_excess %llu, "
772 772 "scanning %lld\n",
773 773 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
774 774 "project" : "zone"),
775 775 (long)lcol->lcol_id.rcid_val,
776 776 (long long)excess, (long long)arg->ssa_scan_goal,
777 777 (unsigned long long)arg->ssa_sum_excess,
778 778 (long long)adjusted_excess);
779 779
780 780 scan(lcol, adjusted_excess);
781 781 lcol->lcol_stat.lcols_scan++;
782 782 }
783 783 }
784 784
785 785 /*
786 786 * When a scan could happen, but caps aren't enforced tick the
787 787 * lcols_unenforced_cap counter.
788 788 */
789 789 /*ARGSUSED*/
790 790 static int
791 791 unenforced_cap_cb(lcollection_t *lcol, void *arg)
792 792 {
793 793 lcol->lcol_stat.lcols_unenforced_cap++;
794 794
795 795 return (0);
796 796 }
797 797
798 798 /*
799 799 * Update the count of physically installed memory.
800 800 */
801 801 static void
802 802 update_phys_total(void)
803 803 {
804 804 uint64_t old_phys_total;
805 805
806 806 old_phys_total = phys_total;
807 807 phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * page_size_kb;
808 808 if (phys_total != old_phys_total)
809 809 debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
810 810 "" : " adjusted"), (unsigned long long)(phys_total / 1024));
811 811 }
812 812
813 813 /*
814 814 * Unlink a process from its collection, updating relevant statistics, and
815 815 * freeing its associated memory.
816 816 */
817 817 void
818 818 lprocess_free(lprocess_t *lpc)
819 819 {
820 820 pid_t pid;
821 821
822 822 lpc->lpc_collection->lcol_stat.lcols_proc_out++;
823 823
824 824 if (lpc->lpc_prev != NULL)
825 825 lpc->lpc_prev->lpc_next = lpc->lpc_next;
826 826 if (lpc->lpc_next != NULL)
827 827 lpc->lpc_next->lpc_prev = lpc->lpc_prev;
828 828 if (lpc->lpc_collection->lcol_lprocess == lpc)
829 829 lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
830 830 lpc ? lpc->lpc_next : NULL);
831 831 lpc->lpc_next = lpc->lpc_prev = NULL;
832 832
833 833 if (lpc->lpc_prpageheader != NULL)
834 834 free(lpc->lpc_prpageheader);
835 835 if (lpc->lpc_xmap != NULL)
836 836 free(lpc->lpc_xmap);
837 837 if (lpc->lpc_psinfo_fd >= 0) {
838 838 if (rfd_close(lpc->lpc_psinfo_fd) != 0)
839 839 debug("could not close %d lpc_psinfo_fd %d",
840 840 (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
841 841 lpc->lpc_psinfo_fd = -1;
842 842 }
843 843 if (lpc->lpc_pgdata_fd >= 0) {
844 844 if (rfd_close(lpc->lpc_pgdata_fd) != 0)
845 845 debug("could not close %d lpc_pgdata_fd %d",
846 846 (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
847 847 lpc->lpc_pgdata_fd = -1;
848 848 }
849 849 if (lpc->lpc_xmap_fd >= 0) {
850 850 if (rfd_close(lpc->lpc_xmap_fd) != 0)
851 851 debug("could not close %d lpc_xmap_fd %d",
852 852 (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
853 853 lpc->lpc_xmap_fd = -1;
854 854 }
855 855 if (lpc->lpc_ignore != NULL)
856 856 lmapping_free(&lpc->lpc_ignore);
857 857 pid = lpc->lpc_pid;
858 858 free(lpc);
859 859 debug_high("process %d freed\n", (int)pid);
860 860 }
861 861
862 862 /*
863 863 * Collection clear callback.
864 864 */
865 865 /*ARGSUSED*/
866 866 static int
867 867 collection_clear_cb(lcollection_t *lcol, void *arg)
868 868 {
869 869 lcol->lcol_mark = 0;
870 870
871 871 return (0);
872 872 }
873 873
874 874 /*
875 875 * Respond to a terminating signal by setting a termination flag.
876 876 */
877 877 /*ARGSUSED*/
878 878 static void
879 879 terminate_signal(int signal)
880 880 {
881 881 if (termination_signal == 0)
882 882 termination_signal = signal;
883 883 should_run = 0;
884 884 }
885 885
/*
 * Handler for synchronous or asynchronous signals that would ordinarily
 * cause the process to abort.  scan_abort() gives the scanner a last chance
 * to resume any stopped processes before abort() is called.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	scan_abort();
	abort();
}
901 901
902 902 /*
903 903 * Clean up collections which have been removed due to configuration. Unlink
904 904 * the collection from lcollection and free it.
905 905 */
906 906 /*ARGSUSED*/
907 907 static int
908 908 collection_sweep_cb(lcollection_t *lcol, void *arg)
909 909 {
910 910 if (lcol->lcol_mark == 0) {
911 911 debug("freeing %s %s\n",
912 912 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
913 913 "project" : "zone"), lcol->lcol_name);
914 914 lcollection_free(lcol);
915 915 }
916 916
917 917 return (0);
918 918 }
919 919
920 920 /*
921 921 * Set those variables which depend on the global configuration.
922 922 */
923 923 static void
924 924 finish_configuration(void)
925 925 {
926 926 /*
927 927 * Warn that any lnode (or non-project) mode specification (by an SRM
928 928 * 1.3 configuration file, for example) is ignored.
929 929 */
930 930 if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
931 931 warn(gettext("%s mode specification ignored -- using project"
932 932 " mode\n"), rcfg.rcfg_mode_name);
933 933 rcfg.rcfg_mode_name = "project";
934 934 rcfg.rcfg_mode = rctype_project;
935 935 }
936 936 }
937 937
938 938 /*
939 939 * Cause the configuration to be reread and applied.
940 940 */
941 941 static void
942 942 reread_configuration(void)
943 943 {
944 944 rcfg_t rcfg_new;
945 945
946 946 if (rcfg_read(&rcfg_new, update_statistics) != E_SUCCESS) {
947 947 warn(gettext("can't reread configuration \n"));
948 948 exit(SMF_EXIT_ERR_CONFIG);
949 949 } else {
950 950 /*
951 951 * Done reading configuration. Remove existing
952 952 * collections in case there is a change in collection type.
953 953 */
954 954 if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
955 955 list_walk_collection(collection_clear_cb, NULL);
956 956 list_walk_collection(collection_sweep_cb, NULL);
957 957 }
958 958
959 959 /*
960 960 * Make the newly-read configuration the global one, and update
961 961 * any variables that depend on it.
962 962 */
963 963 rcfg = rcfg_new;
964 964 finish_configuration();
965 965 }
966 966 }
967 967
968 968 /*
969 969 * First, examine changes, additions, and deletions to cap definitions.
970 970 * Then, set the next event time.
971 971 */
972 972 static void
973 973 reconfigure(hrtime_t now, hrtime_t *next_configuration,
974 974 hrtime_t *next_proc_walk, hrtime_t *next_rss_sample)
975 975 {
976 976 debug("reconfigure...\n");
977 977
978 978 /*
979 979 * Walk the lcollection, marking active collections so inactive ones
980 980 * can be freed.
981 981 */
982 982 list_walk_collection(collection_clear_cb, NULL);
983 983 lcollection_update(LCU_ACTIVE_ONLY); /* mark */
984 984 list_walk_collection(collection_sweep_cb, NULL);
985 985
986 986 *next_configuration = NEXT_EVENT_TIME(now,
987 987 rcfg.rcfg_reconfiguration_interval);
988 988
989 989 /*
990 990 * Reset each event time to the shorter of the previous and new
991 991 * intervals.
992 992 */
993 993 if (next_report == 0 && rcfg.rcfg_report_interval > 0)
994 994 next_report = now;
995 995 else
996 996 next_report = POSITIVE_MIN(next_report,
997 997 NEXT_REPORT_EVENT_TIME(now, rcfg.rcfg_report_interval));
998 998
999 999 if (*next_proc_walk == 0 && rcfg.rcfg_proc_walk_interval > 0)
1000 1000 *next_proc_walk = now;
1001 1001 else
1002 1002 *next_proc_walk = POSITIVE_MIN(*next_proc_walk,
1003 1003 NEXT_EVENT_TIME(now, rcfg.rcfg_proc_walk_interval));
1004 1004
1005 1005 if (*next_rss_sample == 0 && rcfg.rcfg_rss_sample_interval > 0)
1006 1006 *next_rss_sample = now;
1007 1007 else
1008 1008 *next_rss_sample = POSITIVE_MIN(*next_rss_sample,
1009 1009 NEXT_EVENT_TIME(now, rcfg.rcfg_rss_sample_interval));
1010 1010 }
1011 1011
1012 1012 /*
1013 1013 * Respond to SIGHUP by triggering the rereading the configuration and cap
1014 1014 * definitions.
1015 1015 */
1016 1016 /*ARGSUSED*/
1017 1017 static void
1018 1018 sighup(int signal)
1019 1019 {
1020 1020 should_reconfigure = 1;
1021 1021 }
1022 1022
1023 1023 /*
1024 1024 * Print, for debugging purposes, each collection's interval statistics.
1025 1025 */
1026 1026 /*ARGSUSED*/
1027 1027 static int
1028 1028 simple_report_collection_cb(lcollection_t *lcol, void *arg)
1029 1029 {
1030 1030 #define DELTA(field) \
1031 1031 (unsigned long long)( \
1032 1032 (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
1033 1033
1034 1034 debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
1035 1035 "ineffective/scans/unenforced/samplings: %llu/%llu/%llu/%llu, RSS "
1036 1036 "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
1037 1037 "%llu scans over %llu ms\n",
↓ open down ↓ |
1037 lines elided |
↑ open up ↑ |
1038 1038 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
1039 1039 lcol->lcol_name,
1040 1040 DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
1041 1041 DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
1042 1042 DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
1043 1043 (unsigned long long)lcol->lcol_stat.lcols_min_rss,
1044 1044 (unsigned long long)lcol->lcol_stat.lcols_max_rss,
1045 1045 (unsigned long long)lcol->lcol_rss_cap,
1046 1046 (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
1047 1047 lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
1048 - DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC
1049 - / MILLISEC));
1048 + DELTA(lcols_scan_count),
1049 + NSEC2MSEC(DELTA(lcols_scan_time_complete)));
1050 1050
1051 1051 #undef DELTA
1052 1052
1053 1053 return (0);
1054 1054 }
1055 1055
1056 1056 /*
1057 1057 * Record each collection's interval statistics in the statistics file.
1058 1058 */
1059 1059 static int
1060 1060 report_collection_cb(lcollection_t *lcol, void *arg)
1061 1061 {
1062 1062 lcollection_report_t dc;
1063 1063 int fd = (intptr_t)arg;
1064 1064
1065 1065 /*
1066 1066 * Copy the relevant fields to the collection's record.
1067 1067 */
1068 1068 bzero(&dc, sizeof (dc));
1069 1069 dc.lcol_id = lcol->lcol_id;
1070 1070 (void) strcpy(dc.lcol_name, lcol->lcol_name);
1071 1071 dc.lcol_rss = lcol->lcol_rss;
1072 1072 dc.lcol_image_size = lcol->lcol_image_size;
1073 1073 dc.lcol_rss_cap = lcol->lcol_rss_cap;
1074 1074 dc.lcol_stat = lcol->lcol_stat;
1075 1075
1076 1076 if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
1077 1077 lcol->lcol_stat_old = lcol->lcol_stat;
1078 1078 } else {
1079 1079 debug("can't write %s %s statistics",
1080 1080 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ?
1081 1081 "project" : "zone"),
1082 1082 lcol->lcol_name);
1083 1083 }
1084 1084
1085 1085 return (0);
1086 1086 }
1087 1087
1088 1088 /*
1089 1089 * Determine the count of pages scanned by the global page scanner, obtained
1090 1090 * from the cpu_stat:*::scan kstats. Return zero on success.
1091 1091 */
1092 1092 static int
1093 1093 get_globally_scanned_pages(uint64_t *scannedp)
1094 1094 {
1095 1095 kstat_t *ksp;
1096 1096 uint64_t scanned = 0;
1097 1097
1098 1098 if (kstat_chain_update(kctl) == -1) {
1099 1099 warn(gettext("can't update kstat chain"));
1100 1100 return (0);
1101 1101 }
1102 1102
1103 1103 for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
1104 1104 if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
1105 1105 if (kstat_read(kctl, ksp, NULL) != -1) {
1106 1106 scanned += ((cpu_stat_t *)
1107 1107 ksp->ks_data)->cpu_vminfo.scan;
1108 1108 } else {
1109 1109 return (-1);
1110 1110 }
1111 1111 }
1112 1112 }
1113 1113
1114 1114 *scannedp = scanned;
1115 1115 return (0);
1116 1116 }
1117 1117
1118 1118 /*
1119 1119 * Determine if the global page scanner is running, during which no memory
1120 1120 * caps should be enforced, to prevent interference with the global page
1121 1121 * scanner.
1122 1122 */
1123 1123 static boolean_t
1124 1124 is_global_scanner_running()
1125 1125 {
1126 1126 /* measure delta in page scan count */
1127 1127 static uint64_t new_sp = 0;
1128 1128 static uint64_t old_sp = 0;
1129 1129 boolean_t res = B_FALSE;
1130 1130
1131 1131 if (get_globally_scanned_pages(&new_sp) == 0) {
1132 1132 if (old_sp != 0 && (new_sp - old_sp) > 0) {
1133 1133 debug("global memory pressure detected (%llu "
1134 1134 "pages scanned since last interval)\n",
1135 1135 (unsigned long long)(new_sp - old_sp));
1136 1136 res = B_TRUE;
1137 1137 }
1138 1138 old_sp = new_sp;
1139 1139 } else {
1140 1140 warn(gettext("unable to read cpu statistics"));
1141 1141 new_sp = old_sp;
1142 1142 }
1143 1143
1144 1144 return (res);
1145 1145 }
1146 1146
1147 1147 /*
1148 1148 * If soft caps are in use, determine if global memory pressure exceeds the
1149 1149 * configured maximum above which soft caps are enforced.
1150 1150 */
1151 1151 static boolean_t
1152 1152 must_enforce_soft_caps()
1153 1153 {
1154 1154 /*
1155 1155 * Check for changes to the amount of installed physical memory, to
1156 1156 * compute the current memory pressure.
1157 1157 */
1158 1158 update_phys_total();
1159 1159
1160 1160 memory_pressure = 100 - (int)((sysconf(_SC_AVPHYS_PAGES) * page_size_kb)
1161 1161 * 100.0 / phys_total);
1162 1162 memory_pressure_sample++;
1163 1163 if (rcfg.rcfg_memory_cap_enforcement_pressure > 0 &&
1164 1164 memory_pressure > rcfg.rcfg_memory_cap_enforcement_pressure) {
1165 1165 return (B_TRUE);
1166 1166 }
1167 1167
1168 1168 return (B_FALSE);
1169 1169 }
1170 1170
1171 1171 /*
1172 1172 * Update the shared statistics file with each collection's current statistics.
1173 1173 * Return zero on success.
1174 1174 */
1175 1175 static int
1176 1176 update_statistics(void)
1177 1177 {
1178 1178 int fd, res;
1179 1179 static char template[LINELEN];
1180 1180
1181 1181 /*
1182 1182 * Try to create a directory irrespective of whether it is existing
1183 1183 * or not. If it is not there then it will create. Otherwise any way
1184 1184 * it will fail at mkstemp call below.
1185 1185 */
1186 1186 (void) mkdir(STAT_FILE_DIR, 0755);
1187 1187
1188 1188 /*
1189 1189 * Create a temporary file.
1190 1190 */
1191 1191 if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
1192 1192 strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
1193 1193 debug("temporary file template size too small\n");
1194 1194 return (-1);
1195 1195 }
1196 1196 (void) strcpy(template, rcfg.rcfg_stat_file);
1197 1197 (void) strcat(template, STAT_TEMPLATE_SUFFIX);
1198 1198 (void) rfd_reserve(1);
1199 1199 fd = mkstemp(template);
1200 1200
1201 1201 /*
1202 1202 * Write the header and per-collection statistics.
1203 1203 */
1204 1204 if (fd >= 0) {
1205 1205 rcapd_stat_hdr_t rs;
1206 1206
1207 1207 rs.rs_pid = rcapd_pid;
1208 1208 rs.rs_time = gethrtime();
1209 1209 ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
1210 1210 (void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
1211 1211 rs.rs_pressure_cur = memory_pressure;
1212 1212 rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
1213 1213 rs.rs_pressure_sample = memory_pressure_sample;
1214 1214
1215 1215 if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
1216 1216 sizeof (rs)) {
1217 1217 list_walk_collection(report_collection_cb,
1218 1218 (void *)(intptr_t)fd);
1219 1219 /*
1220 1220 * Replace the existing statistics file with this new
1221 1221 * one.
1222 1222 */
1223 1223 res = rename(template, rcfg.rcfg_stat_file);
1224 1224 } else
1225 1225 res = -1;
1226 1226 (void) close(fd);
1227 1227 } else
1228 1228 res = -1;
1229 1229
1230 1230 return (res);
1231 1231 }
1232 1232
1233 1233 /*
1234 1234 * Verify the statistics file can be created and written to, and die if an
1235 1235 * existing file may be in use by another rcapd.
1236 1236 */
1237 1237 static int
1238 1238 verify_statistics(void)
1239 1239 {
1240 1240 pid_t pid;
1241 1241
1242 1242 /*
1243 1243 * Warn if another instance of rcapd might be active.
1244 1244 */
1245 1245 (void) rfd_reserve(1);
1246 1246 pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
1247 1247 if (pid != rcapd_pid && pid != -1)
1248 1248 die(gettext("%s exists; rcapd may already be active\n"),
1249 1249 rcfg.rcfg_stat_file);
1250 1250
1251 1251 return (update_statistics());
1252 1252 }
1253 1253
1254 1254 static int
1255 1255 sum_excess_cb(lcollection_t *lcol, void *arg)
1256 1256 {
1257 1257 uint64_t *sum_excess = arg;
1258 1258
1259 1259 *sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
1260 1260 lcol->lcol_rss_cap));
1261 1261 return (0);
1262 1262 }
1263 1263
1264 1264 /*
1265 1265 * Compute the quantity of memory (in kilobytes) above the cap enforcement
1266 1266 * pressure. Set the scan goal to that quantity (or at most the excess).
1267 1267 */
1268 1268 static void
1269 1269 compute_soft_scan_goal(soft_scan_arg_t *argp)
1270 1270 {
1271 1271 /*
1272 1272 * Compute the sum of the collections' excesses, which will be the
1273 1273 * denominator.
1274 1274 */
1275 1275 argp->ssa_sum_excess = 0;
1276 1276 list_walk_collection(sum_excess_cb, &(argp->ssa_sum_excess));
1277 1277
1278 1278 argp->ssa_scan_goal = MIN((sysconf(_SC_PHYS_PAGES) *
1279 1279 (100 - rcfg.rcfg_memory_cap_enforcement_pressure) / 100 -
1280 1280 sysconf(_SC_AVPHYS_PAGES)) * page_size_kb,
1281 1281 argp->ssa_sum_excess);
1282 1282 }
1283 1283
/*
 * Emit the command-line synopsis through info().
 */
static void
rcapd_usage(void)
{
	info(gettext("usage: rcapd [-d]\n"));
}
1289 1289
1290 1290 void
1291 1291 check_update_statistics(void)
1292 1292 {
1293 1293 hrtime_t now = gethrtime();
1294 1294
1295 1295 if (EVENT_TIME(now, next_report)) {
1296 1296 debug("updating statistics...\n");
1297 1297 list_walk_collection(simple_report_collection_cb, NULL);
1298 1298 if (update_statistics() != 0)
1299 1299 debug("couldn't update statistics");
1300 1300 next_report = NEXT_REPORT_EVENT_TIME(now,
1301 1301 rcfg.rcfg_report_interval);
1302 1302 }
1303 1303 }
1304 1304
1305 1305 static void
1306 1306 verify_and_set_privileges(void)
1307 1307 {
1308 1308 priv_set_t *required =
1309 1309 priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
1310 1310
1311 1311 /*
1312 1312 * Ensure the required privileges, suitable for controlling processes,
1313 1313 * are possessed.
1314 1314 */
1315 1315 if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
1316 1316 PRIV_SET, PRIV_EFFECTIVE, required) != 0)
1317 1317 die(gettext("can't set requisite privileges"));
1318 1318
1319 1319 /*
1320 1320 * Ensure access to /var/run/daemon.
1321 1321 */
1322 1322 if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
1323 1323 die(gettext("cannot become user daemon"));
1324 1324
1325 1325 priv_freeset(required);
1326 1326 }
1327 1327
1328 1328 /*
1329 1329 * This function does the top-level work to determine if we should do any
1330 1330 * memory capping, and if so, it invokes the right call-backs to do the work.
1331 1331 */
1332 1332 static void
1333 1333 do_capping(hrtime_t now, hrtime_t *next_proc_walk)
1334 1334 {
1335 1335 boolean_t enforce_caps;
1336 1336 /* soft cap enforcement flag, depending on memory pressure */
1337 1337 boolean_t enforce_soft_caps;
1338 1338 /* avoid interference with kernel's page scanner */
1339 1339 boolean_t global_scanner_running;
1340 1340 sample_col_arg_t col_arg;
1341 1341 soft_scan_arg_t arg;
1342 1342 uint_t col_types = 0;
1343 1343
1344 1344 /* check what kind of collections (project/zone) are capped */
1345 1345 list_walk_collection(col_type_cb, &col_types);
1346 1346 debug("collection types: 0x%x\n", col_types);
1347 1347
1348 1348 /* no capped collections, skip checking rss */
1349 1349 if (col_types == 0)
1350 1350 return;
1351 1351
1352 1352 /* Determine if soft caps are enforced. */
1353 1353 enforce_soft_caps = must_enforce_soft_caps();
1354 1354
1355 1355 /* Determine if the global page scanner is running. */
1356 1356 global_scanner_running = is_global_scanner_running();
1357 1357
1358 1358 /*
1359 1359 * Sample collections' member processes RSSes and recompute
1360 1360 * collections' excess.
1361 1361 */
1362 1362 rss_sample(B_FALSE, col_types);
1363 1363
1364 1364 col_arg.sca_any_over_cap = B_FALSE;
1365 1365 col_arg.sca_project_over_cap = B_FALSE;
1366 1366 list_walk_collection(rss_sample_col_cb, &col_arg);
1367 1367 list_walk_collection(excess_print_cb, NULL);
1368 1368 debug("any collection/project over cap = %d, %d\n",
1369 1369 col_arg.sca_any_over_cap, col_arg.sca_project_over_cap);
1370 1370
1371 1371 if (enforce_soft_caps)
1372 1372 debug("memory pressure %d%%\n", memory_pressure);
1373 1373
1374 1374 /*
1375 1375 * Cap enforcement is determined by the previous conditions.
1376 1376 */
1377 1377 enforce_caps = !global_scanner_running && col_arg.sca_any_over_cap &&
1378 1378 (rcfg.rcfg_memory_cap_enforcement_pressure == 0 ||
1379 1379 enforce_soft_caps);
1380 1380
1381 1381 debug("%senforcing caps\n", enforce_caps ? "" : "not ");
1382 1382
1383 1383 /*
1384 1384 * If soft caps are in use, determine the size of the portion from each
1385 1385 * collection to scan for.
1386 1386 */
1387 1387 if (enforce_caps && enforce_soft_caps)
1388 1388 compute_soft_scan_goal(&arg);
1389 1389
1390 1390 /*
1391 1391 * Victimize offending collections.
1392 1392 */
1393 1393 if (enforce_caps && (!enforce_soft_caps ||
1394 1394 (arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0))) {
1395 1395
1396 1396 /*
1397 1397 * Since at least one collection is over its cap & needs
1398 1398 * enforcing, check if it is at least time for a process walk
1399 1399 * (we could be well past time since we only walk /proc when
1400 1400 * we need to) and if so, update each collections process list
1401 1401 * in a single pass through /proc.
1402 1402 */
1403 1403 if (EVENT_TIME(now, *next_proc_walk)) {
1404 1404 debug("scanning process list...\n");
1405 1405 proc_walk_all(proc_cb); /* insert & mark */
1406 1406 list_walk_all(sweep_process_cb); /* free dead procs */
1407 1407 *next_proc_walk = NEXT_EVENT_TIME(now,
1408 1408 rcfg.rcfg_proc_walk_interval);
1409 1409 }
1410 1410
1411 1411 gz_col = NULL;
1412 1412 if (enforce_soft_caps) {
1413 1413 debug("scan goal is %lldKB\n",
1414 1414 (long long)arg.ssa_scan_goal);
1415 1415 list_walk_collection(soft_scan_cb, &arg);
1416 1416 if (gz_capped && gz_col != NULL) {
1417 1417 /* process global zone */
1418 1418 arg.ssa_project_over_cap =
1419 1419 col_arg.sca_project_over_cap;
1420 1420 soft_scan_gz(gz_col, &arg);
1421 1421 }
1422 1422 } else {
1423 1423 list_walk_collection(scan_cb, NULL);
1424 1424 if (gz_capped && gz_col != NULL) {
1425 1425 /* process global zone */
1426 1426 scan_gz(gz_col, col_arg.sca_project_over_cap);
1427 1427 }
1428 1428 }
1429 1429 } else if (col_arg.sca_any_over_cap) {
1430 1430 list_walk_collection(unenforced_cap_cb, NULL);
1431 1431 }
1432 1432 }
1433 1433
1434 1434 int
1435 1435 main(int argc, char *argv[])
1436 1436 {
1437 1437 int res;
1438 1438 int should_fork = 1; /* fork flag */
1439 1439 hrtime_t now; /* current time */
1440 1440 hrtime_t next; /* time of next event */
1441 1441 int sig; /* signal iteration */
1442 1442 struct rlimit rl;
1443 1443 hrtime_t next_proc_walk; /* time of next /proc scan */
1444 1444 hrtime_t next_configuration; /* time of next configuration */
1445 1445 hrtime_t next_rss_sample; /* (latest) time of next RSS sample */
1446 1446
1447 1447 (void) set_message_priority(RCM_INFO);
1448 1448 (void) setpname("rcapd");
1449 1449 rcapd_pid = getpid();
1450 1450 (void) chdir("/");
1451 1451 should_run = 1;
1452 1452 ever_ran = 0;
1453 1453
1454 1454 (void) setlocale(LC_ALL, "");
1455 1455 (void) textdomain(TEXT_DOMAIN);
1456 1456
1457 1457 /*
1458 1458 * Parse command-line options.
1459 1459 */
1460 1460 while ((res = getopt(argc, argv, "dF")) > 0)
1461 1461 switch (res) {
1462 1462 case 'd':
1463 1463 should_fork = 0;
1464 1464 if (debug_mode == 0) {
1465 1465 debug_mode = 1;
1466 1466 (void) set_message_priority(RCM_DEBUG);
1467 1467 } else
1468 1468 (void) set_message_priority(RCM_DEBUG_HIGH);
1469 1469 break;
1470 1470 case 'F':
1471 1471 should_fork = 0;
1472 1472 break;
1473 1473 default:
1474 1474 rcapd_usage();
1475 1475 return (E_USAGE);
1476 1476 /*NOTREACHED*/
1477 1477 }
1478 1478
1479 1479 /*
1480 1480 * Read the configuration.
1481 1481 */
1482 1482 if (rcfg_read(&rcfg, verify_statistics) != E_SUCCESS) {
1483 1483 warn(gettext("resource caps not configured\n"));
1484 1484 return (SMF_EXIT_ERR_CONFIG);
1485 1485 }
1486 1486
1487 1487 /*
1488 1488 * If not debugging, fork and continue operating, changing the
1489 1489 * destination of messages to syslog().
1490 1490 */
1491 1491 if (should_fork == 1) {
1492 1492 pid_t child;
1493 1493 debug("forking\n");
1494 1494 child = fork();
1495 1495 if (child == -1)
1496 1496 die(gettext("cannot fork"));
1497 1497 if (child > 0)
1498 1498 return (0);
1499 1499 else {
1500 1500 rcapd_pid = getpid();
1501 1501 (void) set_message_destination(RCD_SYSLOG);
1502 1502 (void) fclose(stdin);
1503 1503 (void) fclose(stdout);
1504 1504 (void) fclose(stderr);
1505 1505 }
1506 1506 /*
1507 1507 * Start a new session and detatch from the controlling tty.
1508 1508 */
1509 1509 if (setsid() == (pid_t)-1)
1510 1510 debug(gettext("setsid() failed; cannot detach from "
1511 1511 "terminal"));
1512 1512 }
1513 1513
1514 1514 finish_configuration();
1515 1515 should_reconfigure = 0;
1516 1516
1517 1517 /*
1518 1518 * Check that required privileges are possessed.
1519 1519 */
1520 1520 verify_and_set_privileges();
1521 1521
1522 1522 now = next_report = next_proc_walk = next_rss_sample = gethrtime();
1523 1523 next_configuration = NEXT_EVENT_TIME(gethrtime(),
1524 1524 rcfg.rcfg_reconfiguration_interval);
1525 1525
1526 1526 /*
1527 1527 * Open the kstat chain.
1528 1528 */
1529 1529 kctl = kstat_open();
1530 1530 if (kctl == NULL)
1531 1531 die(gettext("can't open kstats"));
1532 1532
1533 1533 /*
1534 1534 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
1535 1535 * be effectively managed without revoking descriptors (at 3 per
1536 1536 * process).
1537 1537 */
1538 1538 rl.rlim_cur = 32 * 1024;
1539 1539 rl.rlim_max = 32 * 1024;
1540 1540 if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
1541 1541 getrlimit(RLIMIT_NOFILE, &rl) == 0) {
1542 1542 rl.rlim_cur = rl.rlim_max;
1543 1543 (void) setrlimit(RLIMIT_NOFILE, &rl);
1544 1544 }
1545 1545 (void) enable_extended_FILE_stdio(-1, -1);
1546 1546
1547 1547 if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
1548 1548 debug("fd limit: %lu\n", rl.rlim_cur);
1549 1549 else
1550 1550 debug("fd limit: unknown\n");
1551 1551
1552 1552 get_page_size();
1553 1553 my_zoneid = getzoneid();
1554 1554
1555 1555 /*
1556 1556 * Handle those signals whose (default) exit disposition
1557 1557 * prevents rcapd from finishing scanning before terminating.
1558 1558 */
1559 1559 (void) sigset(SIGINT, terminate_signal);
1560 1560 (void) sigset(SIGQUIT, abort_signal);
1561 1561 (void) sigset(SIGILL, abort_signal);
1562 1562 (void) sigset(SIGEMT, abort_signal);
1563 1563 (void) sigset(SIGFPE, abort_signal);
1564 1564 (void) sigset(SIGBUS, abort_signal);
1565 1565 (void) sigset(SIGSEGV, abort_signal);
1566 1566 (void) sigset(SIGSYS, abort_signal);
1567 1567 (void) sigset(SIGPIPE, terminate_signal);
1568 1568 (void) sigset(SIGALRM, terminate_signal);
1569 1569 (void) sigset(SIGTERM, terminate_signal);
1570 1570 (void) sigset(SIGUSR1, terminate_signal);
1571 1571 (void) sigset(SIGUSR2, terminate_signal);
1572 1572 (void) sigset(SIGPOLL, terminate_signal);
1573 1573 (void) sigset(SIGVTALRM, terminate_signal);
1574 1574 (void) sigset(SIGXCPU, abort_signal);
1575 1575 (void) sigset(SIGXFSZ, abort_signal);
1576 1576 for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
1577 1577 (void) sigset(sig, terminate_signal);
1578 1578
1579 1579 /*
1580 1580 * Install a signal handler for reconfiguration processing.
1581 1581 */
1582 1582 (void) sigset(SIGHUP, sighup);
1583 1583
1584 1584 /*
1585 1585 * Determine which process collections to cap.
1586 1586 */
1587 1587 lcollection_update(LCU_COMPLETE);
1588 1588
1589 1589 /*
1590 1590 * Loop forever, monitoring collections' resident set sizes and
1591 1591 * enforcing their caps. Look for changes in caps as well as
1592 1592 * responding to requests to reread the configuration. Update
1593 1593 * per-collection statistics periodically.
1594 1594 */
1595 1595 while (should_run != 0) {
1596 1596 struct timespec ts;
1597 1597
1598 1598 /*
1599 1599 * Announce that rcapd is starting.
1600 1600 */
1601 1601 if (ever_ran == 0) {
1602 1602 info(gettext("starting\n"));
1603 1603 ever_ran = 1;
1604 1604 }
1605 1605
1606 1606 /*
1607 1607 * Check the configuration at every next_configuration interval.
1608 1608 * Update the rss data once every next_rss_sample interval.
1609 1609 * The condition of global memory pressure is also checked at
1610 1610 * the same frequency, if strict caps are in use.
1611 1611 */
1612 1612 now = gethrtime();
1613 1613
1614 1614 /*
1615 1615 * Detect configuration and cap changes only when SIGHUP
1616 1616 * is received. Call reconfigure to apply new configuration
1617 1617 * parameters.
1618 1618 */
1619 1619 if (should_reconfigure == 1) {
1620 1620 reread_configuration();
1621 1621 should_reconfigure = 0;
1622 1622 reconfigure(now, &next_configuration, &next_proc_walk,
1623 1623 &next_rss_sample);
1624 1624 }
1625 1625
1626 1626 if (EVENT_TIME(now, next_configuration)) {
1627 1627 reconfigure(now, &next_configuration, &next_proc_walk,
1628 1628 &next_rss_sample);
1629 1629 }
1630 1630
1631 1631 /*
1632 1632 * Do the main work for enforcing caps.
1633 1633 */
1634 1634 if (EVENT_TIME(now, next_rss_sample)) {
1635 1635 do_capping(now, &next_proc_walk);
1636 1636
1637 1637 next_rss_sample = NEXT_EVENT_TIME(now,
1638 1638 rcfg.rcfg_rss_sample_interval);
1639 1639 }
1640 1640
1641 1641 /*
1642 1642 * Update the statistics file, if it's time.
1643 1643 */
1644 1644 check_update_statistics();
1645 1645
1646 1646 /*
1647 1647 * Sleep for some time before repeating.
1648 1648 */
1649 1649 now = gethrtime();
1650 1650 next = next_configuration;
1651 1651 next = POSITIVE_MIN(next, next_report);
1652 1652 next = POSITIVE_MIN(next, next_rss_sample);
1653 1653 if (next > now && should_run != 0) {
1654 1654 debug("sleeping %-4.2f seconds\n", (float)(next -
1655 1655 now) / (float)NANOSEC);
1656 1656 hrt2ts(next - now, &ts);
1657 1657 (void) nanosleep(&ts, NULL);
1658 1658 }
1659 1659 }
1660 1660 if (termination_signal != 0)
1661 1661 debug("exiting due to signal %d\n", termination_signal);
1662 1662 if (ever_ran != 0)
1663 1663 info(gettext("exiting\n"));
1664 1664
1665 1665 /*
1666 1666 * Unlink the statistics file before exiting.
1667 1667 */
1668 1668 if (rcfg.rcfg_stat_file[0] != 0)
1669 1669 (void) unlink(rcfg.rcfg_stat_file);
1670 1670
1671 1671 return (E_SUCCESS);
1672 1672 }
↓ open down ↓ |
613 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX