1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 24 * Copyright (c) 2013 Martin Matuska. All rights reserved. 25 * Copyright (c) 2014 Joyent, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
28 */ 29 30 #include <sys/dmu.h> 31 #include <sys/dmu_objset.h> 32 #include <sys/dmu_tx.h> 33 #include <sys/dsl_dataset.h> 34 #include <sys/dsl_dir.h> 35 #include <sys/dsl_prop.h> 36 #include <sys/dsl_synctask.h> 37 #include <sys/dsl_deleg.h> 38 #include <sys/dmu_impl.h> 39 #include <sys/spa.h> 40 #include <sys/metaslab.h> 41 #include <sys/zap.h> 42 #include <sys/zio.h> 43 #include <sys/arc.h> 44 #include <sys/sunddi.h> 45 #include <sys/zfeature.h> 46 #include <sys/policy.h> 47 #include <sys/zfs_znode.h> 48 #include "zfs_namecheck.h" 49 #include "zfs_prop.h" 50 51 /* 52 * Filesystem and Snapshot Limits 53 * ------------------------------ 54 * 55 * These limits are used to restrict the number of filesystems and/or snapshots 56 * that can be created at a given level in the tree or below. A typical 57 * use-case is with a delegated dataset where the administrator wants to ensure 58 * that a user within the zone is not creating too many additional filesystems 59 * or snapshots, even though they're not exceeding their space quota. 60 * 61 * The filesystem and snapshot counts are stored as extensible properties. This 62 * capability is controlled by a feature flag and must be enabled to be used. 63 * Once enabled, the feature is not active until the first limit is set. At 64 * that point, future operations to create/destroy filesystems or snapshots 65 * will validate and update the counts. 66 * 67 * Because the count properties will not exist before the feature is active, 68 * the counts are updated when a limit is first set on an uninitialized 69 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes 70 * all of the nested filesystems/snapshots. Thus, a new leaf node has a 71 * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and 72 * snapshot count properties on a node indicate uninitialized counts on that 73 * node.) 
When first setting a limit on an uninitialized node, the code starts 74 * at the filesystem with the new limit and descends into all sub-filesystems 75 * to add the count properties. 76 * 77 * In practice this is lightweight since a limit is typically set when the 78 * filesystem is created and thus has no children. Once valid, changing the 79 * limit value won't require a re-traversal since the counts are already valid. 80 * When recursively fixing the counts, if a node with a limit is encountered 81 * during the descent, the counts are known to be valid and there is no need to 82 * descend into that filesystem's children. The counts on filesystems above the 83 * one with the new limit will still be uninitialized, unless a limit is 84 * eventually set on one of those filesystems. The counts are always recursively 85 * updated when a limit is set on a dataset, unless there is already a limit. 86 * When a new limit value is set on a filesystem with an existing limit, it is 87 * possible for the new limit to be less than the current count at that level 88 * since a user who can change the limit is also allowed to exceed the limit. 89 * 90 * Once the feature is active, then whenever a filesystem or snapshot is 91 * created, the code recurses up the tree, validating the new count against the 92 * limit at each initialized level. In practice, most levels will not have a 93 * limit set. If there is a limit at any initialized level up the tree, the 94 * check must pass or the creation will fail. Likewise, when a filesystem or 95 * snapshot is destroyed, the counts are recursively adjusted all the way up 96 * the initialized nodes in the tree. Renaming a filesystem into a different point 97 * in the tree will first validate, then update the counts on each branch up to 98 * the common ancestor. A receive will also validate the counts and then update 99 * them. 
100 * 101 * An exception to the above behavior is that the limit is not enforced if the 102 * user has permission to modify the limit. This is primarily so that 103 * recursive snapshots in the global zone always work. We want to prevent a 104 * denial-of-service in which a lower level delegated dataset could max out its 105 * limit and thus block recursive snapshots from being taken in the global zone. 106 * Because of this, it is possible for the snapshot count to be over the limit 107 * and snapshots taken in the global zone could cause a lower level dataset to 108 * hit or exceed its limit. The administrator taking the global zone recursive 109 * snapshot should be aware of this side-effect and behave accordingly. 110 * For consistency, the filesystem limit is also not enforced if the user can 111 * modify the limit. 112 * 113 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() 114 * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in 115 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by 116 * dsl_dir_init_fs_ss_count(). 117 * 118 * There is a special case when we receive a filesystem that already exists. In 119 * this case a temporary clone name of %X is created (see dmu_recv_begin). We 120 * never update the filesystem counts for temporary clones. 121 * 122 * Likewise, we do not update the snapshot counts for temporary snapshots, 123 * such as those created by zfs diff. 
124 */ 125 126 extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); 127 128 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); 129 130 static void 131 dsl_dir_evict(void *dbu) 132 { 133 dsl_dir_t *dd = dbu; 134 dsl_pool_t *dp = dd->dd_pool; 135 int t; 136 137 dd->dd_dbuf = NULL; 138 139 for (t = 0; t < TXG_SIZE; t++) { 140 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 141 ASSERT(dd->dd_tempreserved[t] == 0); 142 ASSERT(dd->dd_space_towrite[t] == 0); 143 } 144 145 if (dd->dd_parent) 146 dsl_dir_async_rele(dd->dd_parent, dd); 147 148 spa_async_close(dd->dd_pool->dp_spa, dd); 149 150 dsl_prop_fini(dd); 151 mutex_destroy(&dd->dd_lock); 152 kmem_free(dd, sizeof (dsl_dir_t)); 153 } 154 155 int 156 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, 157 const char *tail, void *tag, dsl_dir_t **ddp) 158 { 159 dmu_buf_t *dbuf; 160 dsl_dir_t *dd; 161 int err; 162 163 ASSERT(dsl_pool_config_held(dp)); 164 165 err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); 166 if (err != 0) 167 return (err); 168 dd = dmu_buf_get_user(dbuf); 169 #ifdef ZFS_DEBUG 170 { 171 dmu_object_info_t doi; 172 dmu_object_info_from_db(dbuf, &doi); 173 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); 174 ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); 175 } 176 #endif 177 if (dd == NULL) { 178 dsl_dir_t *winner; 179 180 dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); 181 dd->dd_object = ddobj; 182 dd->dd_dbuf = dbuf; 183 dd->dd_pool = dp; 184 mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); 185 dsl_prop_init(dd); 186 187 dsl_dir_snap_cmtime_update(dd); 188 189 if (dsl_dir_phys(dd)->dd_parent_obj) { 190 err = dsl_dir_hold_obj(dp, 191 dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, 192 &dd->dd_parent); 193 if (err != 0) 194 goto errout; 195 if (tail) { 196 #ifdef ZFS_DEBUG 197 uint64_t foundobj; 198 199 err = zap_lookup(dp->dp_meta_objset, 200 dsl_dir_phys(dd->dd_parent)-> 201 dd_child_dir_zapobj, tail, 202 sizeof (foundobj), 1, &foundobj); 203 ASSERT(err || foundobj == 
ddobj); 204 #endif 205 (void) strcpy(dd->dd_myname, tail); 206 } else { 207 err = zap_value_search(dp->dp_meta_objset, 208 dsl_dir_phys(dd->dd_parent)-> 209 dd_child_dir_zapobj, 210 ddobj, 0, dd->dd_myname); 211 } 212 if (err != 0) 213 goto errout; 214 } else { 215 (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); 216 } 217 218 if (dsl_dir_is_clone(dd)) { 219 dmu_buf_t *origin_bonus; 220 dsl_dataset_phys_t *origin_phys; 221 222 /* 223 * We can't open the origin dataset, because 224 * that would require opening this dsl_dir. 225 * Just look at its phys directly instead. 226 */ 227 err = dmu_bonus_hold(dp->dp_meta_objset, 228 dsl_dir_phys(dd)->dd_origin_obj, FTAG, 229 &origin_bonus); 230 if (err != 0) 231 goto errout; 232 origin_phys = origin_bonus->db_data; 233 dd->dd_origin_txg = 234 origin_phys->ds_creation_txg; 235 dmu_buf_rele(origin_bonus, FTAG); 236 } 237 238 dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict, 239 &dd->dd_dbuf); 240 winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); 241 if (winner != NULL) { 242 if (dd->dd_parent) 243 dsl_dir_rele(dd->dd_parent, dd); 244 dsl_prop_fini(dd); 245 mutex_destroy(&dd->dd_lock); 246 kmem_free(dd, sizeof (dsl_dir_t)); 247 dd = winner; 248 } else { 249 spa_open_ref(dp->dp_spa, dd); 250 } 251 } 252 253 /* 254 * The dsl_dir_t has both open-to-close and instantiate-to-evict 255 * holds on the spa. We need the open-to-close holds because 256 * otherwise the spa_refcnt wouldn't change when we open a 257 * dir which the spa also has open, so we could incorrectly 258 * think it was OK to unload/export/destroy the pool. We need 259 * the instantiate-to-evict hold because the dsl_dir_t has a 260 * pointer to the dd_pool, which has a pointer to the spa_t. 
261 */ 262 spa_open_ref(dp->dp_spa, tag); 263 ASSERT3P(dd->dd_pool, ==, dp); 264 ASSERT3U(dd->dd_object, ==, ddobj); 265 ASSERT3P(dd->dd_dbuf, ==, dbuf); 266 *ddp = dd; 267 return (0); 268 269 errout: 270 if (dd->dd_parent) 271 dsl_dir_rele(dd->dd_parent, dd); 272 dsl_prop_fini(dd); 273 mutex_destroy(&dd->dd_lock); 274 kmem_free(dd, sizeof (dsl_dir_t)); 275 dmu_buf_rele(dbuf, tag); 276 return (err); 277 } 278 279 void 280 dsl_dir_rele(dsl_dir_t *dd, void *tag) 281 { 282 dprintf_dd(dd, "%s\n", ""); 283 spa_close(dd->dd_pool->dp_spa, tag); 284 dmu_buf_rele(dd->dd_dbuf, tag); 285 } 286 287 /* 288 * Remove a reference to the given dsl dir that is being asynchronously 289 * released. Async releases occur from a taskq performing eviction of 290 * dsl datasets and dirs. This process is identical to a normal release 291 * with the exception of using the async API for releasing the reference on 292 * the spa. 293 */ 294 void 295 dsl_dir_async_rele(dsl_dir_t *dd, void *tag) 296 { 297 dprintf_dd(dd, "%s\n", ""); 298 spa_async_close(dd->dd_pool->dp_spa, tag); 299 dmu_buf_rele(dd->dd_dbuf, tag); 300 } 301 302 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ 303 void 304 dsl_dir_name(dsl_dir_t *dd, char *buf) 305 { 306 if (dd->dd_parent) { 307 dsl_dir_name(dd->dd_parent, buf); 308 (void) strcat(buf, "/"); 309 } else { 310 buf[0] = '\0'; 311 } 312 if (!MUTEX_HELD(&dd->dd_lock)) { 313 /* 314 * recursive mutex so that we can use 315 * dprintf_dd() with dd_lock held 316 */ 317 mutex_enter(&dd->dd_lock); 318 (void) strcat(buf, dd->dd_myname); 319 mutex_exit(&dd->dd_lock); 320 } else { 321 (void) strcat(buf, dd->dd_myname); 322 } 323 } 324 325 /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ 326 int 327 dsl_dir_namelen(dsl_dir_t *dd) 328 { 329 int result = 0; 330 331 if (dd->dd_parent) { 332 /* parent's name + 1 for the "/" */ 333 result = dsl_dir_namelen(dd->dd_parent) + 1; 334 } 335 336 if (!MUTEX_HELD(&dd->dd_lock)) { 337 /* 
see dsl_dir_name */ 338 mutex_enter(&dd->dd_lock); 339 result += strlen(dd->dd_myname); 340 mutex_exit(&dd->dd_lock); 341 } else { 342 result += strlen(dd->dd_myname); 343 } 344 345 return (result); 346 } 347 348 static int 349 getcomponent(const char *path, char *component, const char **nextp) 350 { 351 char *p; 352 353 if ((path == NULL) || (path[0] == '\0')) 354 return (SET_ERROR(ENOENT)); 355 /* This would be a good place to reserve some namespace... */ 356 p = strpbrk(path, "/@"); 357 if (p && (p[1] == '/' || p[1] == '@')) { 358 /* two separators in a row */ 359 return (SET_ERROR(EINVAL)); 360 } 361 if (p == NULL || p == path) { 362 /* 363 * if the first thing is an @ or /, it had better be an 364 * @ and it had better not have any more ats or slashes, 365 * and it had better have something after the @. 366 */ 367 if (p != NULL && 368 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) 369 return (SET_ERROR(EINVAL)); 370 if (strlen(path) >= MAXNAMELEN) 371 return (SET_ERROR(ENAMETOOLONG)); 372 (void) strcpy(component, path); 373 p = NULL; 374 } else if (p[0] == '/') { 375 if (p - path >= MAXNAMELEN) 376 return (SET_ERROR(ENAMETOOLONG)); 377 (void) strncpy(component, path, p - path); 378 component[p - path] = '\0'; 379 p++; 380 } else if (p[0] == '@') { 381 /* 382 * if the next separator is an @, there better not be 383 * any more slashes. 384 */ 385 if (strchr(path, '/')) 386 return (SET_ERROR(EINVAL)); 387 if (p - path >= MAXNAMELEN) 388 return (SET_ERROR(ENAMETOOLONG)); 389 (void) strncpy(component, path, p - path); 390 component[p - path] = '\0'; 391 } else { 392 panic("invalid p=%p", (void *)p); 393 } 394 *nextp = p; 395 return (0); 396 } 397 398 /* 399 * Return the dsl_dir_t, and possibly the last component which couldn't 400 * be found in *tail. The name must be in the specified dsl_pool_t. This 401 * thread must hold the dp_config_rwlock for the pool. Returns NULL if the 402 * path is bogus, or if tail==NULL and we couldn't parse the whole name. 
403 * (*tail)[0] == '@' means that the last component is a snapshot. 404 */ 405 int 406 dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, 407 dsl_dir_t **ddp, const char **tailp) 408 { 409 char buf[MAXNAMELEN]; 410 const char *spaname, *next, *nextnext = NULL; 411 int err; 412 dsl_dir_t *dd; 413 uint64_t ddobj; 414 415 err = getcomponent(name, buf, &next); 416 if (err != 0) 417 return (err); 418 419 /* Make sure the name is in the specified pool. */ 420 spaname = spa_name(dp->dp_spa); 421 if (strcmp(buf, spaname) != 0) 422 return (SET_ERROR(EXDEV)); 423 424 ASSERT(dsl_pool_config_held(dp)); 425 426 err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); 427 if (err != 0) { 428 return (err); 429 } 430 431 while (next != NULL) { 432 dsl_dir_t *child_dd; 433 err = getcomponent(next, buf, &nextnext); 434 if (err != 0) 435 break; 436 ASSERT(next[0] != '\0'); 437 if (next[0] == '@') 438 break; 439 dprintf("looking up %s in obj%lld\n", 440 buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); 441 442 err = zap_lookup(dp->dp_meta_objset, 443 dsl_dir_phys(dd)->dd_child_dir_zapobj, 444 buf, sizeof (ddobj), 1, &ddobj); 445 if (err != 0) { 446 if (err == ENOENT) 447 err = 0; 448 break; 449 } 450 451 err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); 452 if (err != 0) 453 break; 454 dsl_dir_rele(dd, tag); 455 dd = child_dd; 456 next = nextnext; 457 } 458 459 if (err != 0) { 460 dsl_dir_rele(dd, tag); 461 return (err); 462 } 463 464 /* 465 * It's an error if there's more than one component left, or 466 * tailp==NULL and there's any component left. 
467 */ 468 if (next != NULL && 469 (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { 470 /* bad path name */ 471 dsl_dir_rele(dd, tag); 472 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); 473 err = SET_ERROR(ENOENT); 474 } 475 if (tailp != NULL) 476 *tailp = next; 477 *ddp = dd; 478 return (err); 479 } 480 481 /* 482 * If the counts are already initialized for this filesystem and its 483 * descendants then do nothing, otherwise initialize the counts. 484 * 485 * The counts on this filesystem, and those below, may be uninitialized due to 486 * either the use of a pre-existing pool which did not support the 487 * filesystem/snapshot limit feature, or one in which the feature had not yet 488 * been enabled. 489 * 490 * Recursively descend the filesystem tree and update the filesystem/snapshot 491 * counts on each filesystem below, then update the cumulative count on the 492 * current filesystem. If the filesystem already has a count set on it, 493 * then we know that its counts, and the counts on the filesystems below it, 494 * are already correct, so we don't have to update this filesystem. 495 */ 496 static void 497 dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) 498 { 499 uint64_t my_fs_cnt = 0; 500 uint64_t my_ss_cnt = 0; 501 dsl_pool_t *dp = dd->dd_pool; 502 objset_t *os = dp->dp_meta_objset; 503 zap_cursor_t *zc; 504 zap_attribute_t *za; 505 dsl_dataset_t *ds; 506 507 ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); 508 ASSERT(dsl_pool_config_held(dp)); 509 ASSERT(dmu_tx_is_syncing(tx)); 510 511 dsl_dir_zapify(dd, tx); 512 513 /* 514 * If the filesystem count has already been initialized then we 515 * don't need to recurse down any further. 
516 */ 517 if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) 518 return; 519 520 zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); 521 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 522 523 /* Iterate my child dirs */ 524 for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); 525 zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { 526 dsl_dir_t *chld_dd; 527 uint64_t count; 528 529 VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, 530 &chld_dd)); 531 532 /* 533 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and 534 * temporary datasets. 535 */ 536 if (chld_dd->dd_myname[0] == '$' || 537 chld_dd->dd_myname[0] == '%') { 538 dsl_dir_rele(chld_dd, FTAG); 539 continue; 540 } 541 542 my_fs_cnt++; /* count this child */ 543 544 dsl_dir_init_fs_ss_count(chld_dd, tx); 545 546 VERIFY0(zap_lookup(os, chld_dd->dd_object, 547 DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); 548 my_fs_cnt += count; 549 VERIFY0(zap_lookup(os, chld_dd->dd_object, 550 DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); 551 my_ss_cnt += count; 552 553 dsl_dir_rele(chld_dd, FTAG); 554 } 555 zap_cursor_fini(zc); 556 /* Count my snapshots (we counted children's snapshots above) */ 557 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, 558 dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); 559 560 for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); 561 zap_cursor_retrieve(zc, za) == 0; 562 zap_cursor_advance(zc)) { 563 /* Don't count temporary snapshots */ 564 if (za->za_name[0] != '%') 565 my_ss_cnt++; 566 } 567 zap_cursor_fini(zc); 568 569 dsl_dataset_rele(ds, FTAG); 570 571 kmem_free(zc, sizeof (zap_cursor_t)); 572 kmem_free(za, sizeof (zap_attribute_t)); 573 574 /* we're in a sync task, update counts */ 575 dmu_buf_will_dirty(dd->dd_dbuf, tx); 576 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, 577 sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); 578 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, 
579 sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); 580 } 581 582 static int 583 dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) 584 { 585 char *ddname = (char *)arg; 586 dsl_pool_t *dp = dmu_tx_pool(tx); 587 dsl_dataset_t *ds; 588 dsl_dir_t *dd; 589 int error; 590 591 error = dsl_dataset_hold(dp, ddname, FTAG, &ds); 592 if (error != 0) 593 return (error); 594 595 if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { 596 dsl_dataset_rele(ds, FTAG); 597 return (SET_ERROR(ENOTSUP)); 598 } 599 600 dd = ds->ds_dir; 601 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && 602 dsl_dir_is_zapified(dd) && 603 zap_contains(dp->dp_meta_objset, dd->dd_object, 604 DD_FIELD_FILESYSTEM_COUNT) == 0) { 605 dsl_dataset_rele(ds, FTAG); 606 return (SET_ERROR(EALREADY)); 607 } 608 609 dsl_dataset_rele(ds, FTAG); 610 return (0); 611 } 612 613 static void 614 dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) 615 { 616 char *ddname = (char *)arg; 617 dsl_pool_t *dp = dmu_tx_pool(tx); 618 dsl_dataset_t *ds; 619 spa_t *spa; 620 621 VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); 622 623 spa = dsl_dataset_get_spa(ds); 624 625 if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { 626 /* 627 * Since the feature was not active and we're now setting a 628 * limit, increment the feature-active counter so that the 629 * feature becomes active for the first time. 630 * 631 * We are already in a sync task so we can update the MOS. 632 */ 633 spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); 634 } 635 636 /* 637 * Since we are now setting a non-UINT64_MAX limit on the filesystem, 638 * we need to ensure the counts are correct. Descend down the tree from 639 * this point and update all of the counts to be accurate. 640 */ 641 dsl_dir_init_fs_ss_count(ds->ds_dir, tx); 642 643 dsl_dataset_rele(ds, FTAG); 644 } 645 646 /* 647 * Make sure the feature is enabled and activate it if necessary. 648 * Since we're setting a limit, ensure the on-disk counts are valid. 
649 * This is only called by the ioctl path when setting a limit value. 650 * 651 * We do not need to validate the new limit, since users who can change the 652 * limit are also allowed to exceed the limit. 653 */ 654 int 655 dsl_dir_activate_fs_ss_limit(const char *ddname) 656 { 657 int error; 658 659 error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, 660 dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, 661 ZFS_SPACE_CHECK_RESERVED); 662 663 if (error == EALREADY) 664 error = 0; 665 666 return (error); 667 } 668 669 /* 670 * Used to determine if the filesystem_limit or snapshot_limit should be 671 * enforced. We allow the limit to be exceeded if the user has permission to 672 * write the property value. We pass in the creds that we got in the open 673 * context since we will always be the GZ root in syncing context. We also have 674 * to handle the case where we are allowed to change the limit on the current 675 * dataset, but there may be another limit in the tree above. 676 * 677 * We can never modify these two properties within a non-global zone. In 678 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We 679 * can't use that function since we are already holding the dp_config_rwlock. 680 * In addition, we already have the dd and dealing with snapshots is simplified 681 * in this code. 
682 */ 683 684 typedef enum { 685 ENFORCE_ALWAYS, 686 ENFORCE_NEVER, 687 ENFORCE_ABOVE 688 } enforce_res_t; 689 690 static enforce_res_t 691 dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) 692 { 693 enforce_res_t enforce = ENFORCE_ALWAYS; 694 uint64_t obj; 695 dsl_dataset_t *ds; 696 uint64_t zoned; 697 698 ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || 699 prop == ZFS_PROP_SNAPSHOT_LIMIT); 700 701 #ifdef _KERNEL 702 if (crgetzoneid(cr) != GLOBAL_ZONEID) 703 return (ENFORCE_ALWAYS); 704 705 if (secpolicy_zfs(cr) == 0) 706 return (ENFORCE_NEVER); 707 #endif 708 709 if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) 710 return (ENFORCE_ALWAYS); 711 712 ASSERT(dsl_pool_config_held(dd->dd_pool)); 713 714 if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) 715 return (ENFORCE_ALWAYS); 716 717 if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { 718 /* Only root can access zoned fs's from the GZ */ 719 enforce = ENFORCE_ALWAYS; 720 } else { 721 if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) 722 enforce = ENFORCE_ABOVE; 723 } 724 725 dsl_dataset_rele(ds, FTAG); 726 return (enforce); 727 } 728 729 /* 730 * Check if adding additional child filesystem(s) would exceed any filesystem 731 * limits or adding additional snapshot(s) would exceed any snapshot limits. 732 * The prop argument indicates which limit to check. 733 * 734 * Note that all filesystem limits up to the root (or the highest 735 * initialized) filesystem or the given ancestor must be satisfied. 
736 */ 737 int 738 dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, 739 dsl_dir_t *ancestor, cred_t *cr) 740 { 741 objset_t *os = dd->dd_pool->dp_meta_objset; 742 uint64_t limit, count; 743 char *count_prop; 744 enforce_res_t enforce; 745 int err = 0; 746 747 ASSERT(dsl_pool_config_held(dd->dd_pool)); 748 ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || 749 prop == ZFS_PROP_SNAPSHOT_LIMIT); 750 751 /* 752 * If we're allowed to change the limit, don't enforce the limit 753 * e.g. this can happen if a snapshot is taken by an administrative 754 * user in the global zone (i.e. a recursive snapshot by root). 755 * However, we must handle the case of delegated permissions where we 756 * are allowed to change the limit on the current dataset, but there 757 * is another limit in the tree above. 758 */ 759 enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); 760 if (enforce == ENFORCE_NEVER) 761 return (0); 762 763 /* 764 * e.g. if renaming a dataset with no snapshots, count adjustment 765 * is 0. 766 */ 767 if (delta == 0) 768 return (0); 769 770 if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { 771 /* 772 * We don't enforce the limit for temporary snapshots. This is 773 * indicated by a NULL cred_t argument. 774 */ 775 if (cr == NULL) 776 return (0); 777 778 count_prop = DD_FIELD_SNAPSHOT_COUNT; 779 } else { 780 count_prop = DD_FIELD_FILESYSTEM_COUNT; 781 } 782 783 /* 784 * If an ancestor has been provided, stop checking the limit once we 785 * hit that dir. We need this during rename so that we don't overcount 786 * the check once we recurse up to the common ancestor. 787 */ 788 if (ancestor == dd) 789 return (0); 790 791 /* 792 * If we hit an uninitialized node while recursing up the tree, we can 793 * stop since we know there is no limit here (or above). The counts are 794 * not valid on this node and we know we won't touch this node's counts. 
795 */ 796 if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, 797 count_prop, sizeof (count), 1, &count) == ENOENT) 798 return (0); 799 800 err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, 801 B_FALSE); 802 if (err != 0) 803 return (err); 804 805 /* Is there a limit which we've hit? */ 806 if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) 807 return (SET_ERROR(EDQUOT)); 808 809 if (dd->dd_parent != NULL) 810 err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, 811 ancestor, cr); 812 813 return (err); 814 } 815 816 /* 817 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all 818 * parents. When a new filesystem/snapshot is created, increment the count on 819 * all parents, and when a filesystem/snapshot is destroyed, decrement the 820 * count. 821 */ 822 void 823 dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, 824 dmu_tx_t *tx) 825 { 826 int err; 827 objset_t *os = dd->dd_pool->dp_meta_objset; 828 uint64_t count; 829 830 ASSERT(dsl_pool_config_held(dd->dd_pool)); 831 ASSERT(dmu_tx_is_syncing(tx)); 832 ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || 833 strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); 834 835 /* 836 * When we receive an incremental stream into a filesystem that already 837 * exists, a temporary clone is created. We don't count this temporary 838 * clone, whose name begins with a '%'. We also ignore hidden ($FREE, 839 * $MOS & $ORIGIN) objsets. 840 */ 841 if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && 842 strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) 843 return; 844 845 /* 846 * e.g. if renaming a dataset with no snapshots, count adjustment is 0 847 */ 848 if (delta == 0) 849 return; 850 851 /* 852 * If we hit an uninitialized node while recursing up the tree, we can 853 * stop since we know the counts are not valid on this node and we 854 * know we shouldn't touch this node's counts. 
An uninitialized count 855 * on the node indicates that either the feature has not yet been 856 * activated or there are no limits on this part of the tree. 857 */ 858 if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, 859 prop, sizeof (count), 1, &count)) == ENOENT) 860 return; 861 VERIFY0(err); 862 863 count += delta; 864 /* Use a signed verify to make sure we're not neg. */ 865 VERIFY3S(count, >=, 0); 866 867 VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, 868 tx)); 869 870 /* Roll up this additional count into our ancestors */ 871 if (dd->dd_parent != NULL) 872 dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); 873 } 874 875 uint64_t 876 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, 877 dmu_tx_t *tx) 878 { 879 objset_t *mos = dp->dp_meta_objset; 880 uint64_t ddobj; 881 dsl_dir_phys_t *ddphys; 882 dmu_buf_t *dbuf; 883 884 ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, 885 DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); 886 if (pds) { 887 VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, 888 name, sizeof (uint64_t), 1, &ddobj, tx)); 889 } else { 890 /* it's the root dir */ 891 VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, 892 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); 893 } 894 VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); 895 dmu_buf_will_dirty(dbuf, tx); 896 ddphys = dbuf->db_data; 897 898 ddphys->dd_creation_time = gethrestime_sec(); 899 if (pds) { 900 ddphys->dd_parent_obj = pds->dd_object; 901 902 /* update the filesystem counts */ 903 dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); 904 } 905 ddphys->dd_props_zapobj = zap_create(mos, 906 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 907 ddphys->dd_child_dir_zapobj = zap_create(mos, 908 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 909 if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) 910 ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; 911 dmu_buf_rele(dbuf, FTAG); 912 913 return 
(ddobj); 914 } 915 916 boolean_t 917 dsl_dir_is_clone(dsl_dir_t *dd) 918 { 919 return (dsl_dir_phys(dd)->dd_origin_obj && 920 (dd->dd_pool->dp_origin_snap == NULL || 921 dsl_dir_phys(dd)->dd_origin_obj != 922 dd->dd_pool->dp_origin_snap->ds_object)); 923 } 924 925 void 926 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) 927 { 928 mutex_enter(&dd->dd_lock); 929 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 930 dsl_dir_phys(dd)->dd_used_bytes); 931 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, 932 dsl_dir_phys(dd)->dd_quota); 933 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, 934 dsl_dir_phys(dd)->dd_reserved); 935 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, 936 dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 : 937 (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / 938 dsl_dir_phys(dd)->dd_compressed_bytes)); 939 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, 940 dsl_dir_phys(dd)->dd_uncompressed_bytes); 941 if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { 942 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, 943 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); 944 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, 945 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); 946 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, 947 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); 948 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, 949 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + 950 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); 951 } 952 mutex_exit(&dd->dd_lock); 953 954 if (dsl_dir_is_zapified(dd)) { 955 uint64_t count; 956 objset_t *os = dd->dd_pool->dp_meta_objset; 957 958 if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, 959 sizeof (count), 1, &count) == 0) { 960 dsl_prop_nvlist_add_uint64(nv, 961 ZFS_PROP_FILESYSTEM_COUNT, count); 962 } 963 if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, 964 sizeof (count), 1, &count) == 0) { 965 dsl_prop_nvlist_add_uint64(nv, 966 
ZFS_PROP_SNAPSHOT_COUNT, count); 967 } 968 } 969 970 if (dsl_dir_is_clone(dd)) { 971 dsl_dataset_t *ds; 972 char buf[MAXNAMELEN]; 973 974 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, 975 dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); 976 dsl_dataset_name(ds, buf); 977 dsl_dataset_rele(ds, FTAG); 978 dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); 979 } 980 } 981 982 void 983 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) 984 { 985 dsl_pool_t *dp = dd->dd_pool; 986 987 ASSERT(dsl_dir_phys(dd)); 988 989 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { 990 /* up the hold count until we can be written out */ 991 dmu_buf_add_ref(dd->dd_dbuf, dd); 992 } 993 } 994 995 static int64_t 996 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) 997 { 998 uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); 999 uint64_t new_accounted = 1000 MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); 1001 return (new_accounted - old_accounted); 1002 } 1003 1004 void 1005 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) 1006 { 1007 ASSERT(dmu_tx_is_syncing(tx)); 1008 1009 mutex_enter(&dd->dd_lock); 1010 ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); 1011 dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, 1012 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); 1013 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; 1014 mutex_exit(&dd->dd_lock); 1015 1016 /* release the hold from dsl_dir_dirty */ 1017 dmu_buf_rele(dd->dd_dbuf, dd); 1018 } 1019 1020 static uint64_t 1021 dsl_dir_space_towrite(dsl_dir_t *dd) 1022 { 1023 uint64_t space = 0; 1024 int i; 1025 1026 ASSERT(MUTEX_HELD(&dd->dd_lock)); 1027 1028 for (i = 0; i < TXG_SIZE; i++) { 1029 space += dd->dd_space_towrite[i&TXG_MASK]; 1030 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); 1031 } 1032 return (space); 1033 } 1034 1035 /* 1036 * How much space would dd have available if ancestor had delta applied 1037 * to it? If ondiskonly is set, we're only interested in what's 1038 * on-disk, not estimated pending changes. 
1039 */ 1040 uint64_t 1041 dsl_dir_space_available(dsl_dir_t *dd, 1042 dsl_dir_t *ancestor, int64_t delta, int ondiskonly) 1043 { 1044 uint64_t parentspace, myspace, quota, used; 1045 1046 /* 1047 * If there are no restrictions otherwise, assume we have 1048 * unlimited space available. 1049 */ 1050 quota = UINT64_MAX; 1051 parentspace = UINT64_MAX; 1052 1053 if (dd->dd_parent != NULL) { 1054 parentspace = dsl_dir_space_available(dd->dd_parent, 1055 ancestor, delta, ondiskonly); 1056 } 1057 1058 mutex_enter(&dd->dd_lock); 1059 if (dsl_dir_phys(dd)->dd_quota != 0) 1060 quota = dsl_dir_phys(dd)->dd_quota; 1061 used = dsl_dir_phys(dd)->dd_used_bytes; 1062 if (!ondiskonly) 1063 used += dsl_dir_space_towrite(dd); 1064 1065 if (dd->dd_parent == NULL) { 1066 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); 1067 quota = MIN(quota, poolsize); 1068 } 1069 1070 if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { 1071 /* 1072 * We have some space reserved, in addition to what our 1073 * parent gave us. 
1074 */ 1075 parentspace += dsl_dir_phys(dd)->dd_reserved - used; 1076 } 1077 1078 if (dd == ancestor) { 1079 ASSERT(delta <= 0); 1080 ASSERT(used >= -delta); 1081 used += delta; 1082 if (parentspace != UINT64_MAX) 1083 parentspace -= delta; 1084 } 1085 1086 if (used > quota) { 1087 /* over quota */ 1088 myspace = 0; 1089 } else { 1090 /* 1091 * the lesser of the space provided by our parent and 1092 * the space left in our quota 1093 */ 1094 myspace = MIN(parentspace, quota - used); 1095 } 1096 1097 mutex_exit(&dd->dd_lock); 1098 1099 return (myspace); 1100 } 1101 1102 struct tempreserve { 1103 list_node_t tr_node; 1104 dsl_dir_t *tr_ds; 1105 uint64_t tr_size; 1106 }; 1107 1108 static int 1109 dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, 1110 boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, 1111 dmu_tx_t *tx, boolean_t first) 1112 { 1113 uint64_t txg = tx->tx_txg; 1114 uint64_t est_inflight, used_on_disk, quota, parent_rsrv; 1115 uint64_t deferred = 0; 1116 struct tempreserve *tr; 1117 int retval = EDQUOT; 1118 int txgidx = txg & TXG_MASK; 1119 int i; 1120 uint64_t ref_rsrv = 0; 1121 1122 ASSERT3U(txg, !=, 0); 1123 ASSERT3S(asize, >, 0); 1124 1125 mutex_enter(&dd->dd_lock); 1126 1127 /* 1128 * Check against the dsl_dir's quota. We don't add in the delta 1129 * when checking for over-quota because they get one free hit. 1130 */ 1131 est_inflight = dsl_dir_space_towrite(dd); 1132 for (i = 0; i < TXG_SIZE; i++) 1133 est_inflight += dd->dd_tempreserved[i]; 1134 used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; 1135 1136 /* 1137 * On the first iteration, fetch the dataset's used-on-disk and 1138 * refreservation values. Also, if checkrefquota is set, test if 1139 * allocating this space would exceed the dataset's refquota. 
1140 */ 1141 if (first && tx->tx_objset) { 1142 int error; 1143 dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; 1144 1145 error = dsl_dataset_check_quota(ds, checkrefquota, 1146 asize, est_inflight, &used_on_disk, &ref_rsrv); 1147 if (error) { 1148 mutex_exit(&dd->dd_lock); 1149 return (error); 1150 } 1151 } 1152 1153 /* 1154 * If this transaction will result in a net free of space, 1155 * we want to let it through. 1156 */ 1157 if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) 1158 quota = UINT64_MAX; 1159 else 1160 quota = dsl_dir_phys(dd)->dd_quota; 1161 1162 /* 1163 * Adjust the quota against the actual pool size at the root 1164 * minus any outstanding deferred frees. 1165 * To ensure that it's possible to remove files from a full 1166 * pool without inducing transient overcommits, we throttle 1167 * netfree transactions against a quota that is slightly larger, 1168 * but still within the pool's allocation slop. In cases where 1169 * we're very close to full, this will allow a steady trickle of 1170 * removes to get through. 1171 */ 1172 if (dd->dd_parent == NULL) { 1173 spa_t *spa = dd->dd_pool->dp_spa; 1174 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); 1175 deferred = metaslab_class_get_deferred(spa_normal_class(spa)); 1176 if (poolsize - deferred < quota) { 1177 quota = poolsize - deferred; 1178 retval = ENOSPC; 1179 } 1180 } 1181 1182 /* 1183 * If they are requesting more space, and our current estimate 1184 * is over quota, they get to try again unless the actual 1185 * on-disk is over quota and there are no pending changes (which 1186 * may free up space for us). 
1187 */ 1188 if (used_on_disk + est_inflight >= quota) { 1189 if (est_inflight > 0 || used_on_disk < quota || 1190 (retval == ENOSPC && used_on_disk < quota + deferred)) 1191 retval = ERESTART; 1192 dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " 1193 "quota=%lluK tr=%lluK err=%d\n", 1194 used_on_disk>>10, est_inflight>>10, 1195 quota>>10, asize>>10, retval); 1196 mutex_exit(&dd->dd_lock); 1197 return (SET_ERROR(retval)); 1198 } 1199 1200 /* We need to up our estimated delta before dropping dd_lock */ 1201 dd->dd_tempreserved[txgidx] += asize; 1202 1203 parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, 1204 asize - ref_rsrv); 1205 mutex_exit(&dd->dd_lock); 1206 1207 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); 1208 tr->tr_ds = dd; 1209 tr->tr_size = asize; 1210 list_insert_tail(tr_list, tr); 1211 1212 /* see if it's OK with our parent */ 1213 if (dd->dd_parent && parent_rsrv) { 1214 boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); 1215 1216 return (dsl_dir_tempreserve_impl(dd->dd_parent, 1217 parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); 1218 } else { 1219 return (0); 1220 } 1221 } 1222 1223 /* 1224 * Reserve space in this dsl_dir, to be used in this tx's txg. 1225 * After the space has been dirtied (and dsl_dir_willuse_space() 1226 * has been called), the reservation should be canceled, using 1227 * dsl_dir_tempreserve_clear(). 
1228 */ 1229 int 1230 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, 1231 uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) 1232 { 1233 int err; 1234 list_t *tr_list; 1235 1236 if (asize == 0) { 1237 *tr_cookiep = NULL; 1238 return (0); 1239 } 1240 1241 tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 1242 list_create(tr_list, sizeof (struct tempreserve), 1243 offsetof(struct tempreserve, tr_node)); 1244 ASSERT3S(asize, >, 0); 1245 ASSERT3S(fsize, >=, 0); 1246 1247 err = arc_tempreserve_space(lsize, tx->tx_txg); 1248 if (err == 0) { 1249 struct tempreserve *tr; 1250 1251 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); 1252 tr->tr_size = lsize; 1253 list_insert_tail(tr_list, tr); 1254 } else { 1255 if (err == EAGAIN) { 1256 /* 1257 * If arc_memory_throttle() detected that pageout 1258 * is running and we are low on memory, we delay new 1259 * non-pageout transactions to give pageout an 1260 * advantage. 1261 * 1262 * It is unfortunate to be delaying while the caller's 1263 * locks are held. 1264 */ 1265 txg_delay(dd->dd_pool, tx->tx_txg, 1266 MSEC2NSEC(10), MSEC2NSEC(10)); 1267 err = SET_ERROR(ERESTART); 1268 } 1269 } 1270 1271 if (err == 0) { 1272 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, 1273 FALSE, asize > usize, tr_list, tx, TRUE); 1274 } 1275 1276 if (err != 0) 1277 dsl_dir_tempreserve_clear(tr_list, tx); 1278 else 1279 *tr_cookiep = tr_list; 1280 1281 return (err); 1282 } 1283 1284 /* 1285 * Clear a temporary reservation that we previously made with 1286 * dsl_dir_tempreserve_space(). 
1287 */ 1288 void 1289 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) 1290 { 1291 int txgidx = tx->tx_txg & TXG_MASK; 1292 list_t *tr_list = tr_cookie; 1293 struct tempreserve *tr; 1294 1295 ASSERT3U(tx->tx_txg, !=, 0); 1296 1297 if (tr_cookie == NULL) 1298 return; 1299 1300 while ((tr = list_head(tr_list)) != NULL) { 1301 if (tr->tr_ds) { 1302 mutex_enter(&tr->tr_ds->dd_lock); 1303 ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, 1304 tr->tr_size); 1305 tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; 1306 mutex_exit(&tr->tr_ds->dd_lock); 1307 } else { 1308 arc_tempreserve_clear(tr->tr_size); 1309 } 1310 list_remove(tr_list, tr); 1311 kmem_free(tr, sizeof (struct tempreserve)); 1312 } 1313 1314 kmem_free(tr_list, sizeof (list_t)); 1315 } 1316 1317 /* 1318 * This should be called from open context when we think we're going to write 1319 * or free space, for example when dirtying data. Be conservative; it's okay 1320 * to write less space or free more, but we don't want to write more or free 1321 * less than the amount specified. 1322 */ 1323 void 1324 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) 1325 { 1326 int64_t parent_space; 1327 uint64_t est_used; 1328 1329 mutex_enter(&dd->dd_lock); 1330 if (space > 0) 1331 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; 1332 1333 est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; 1334 parent_space = parent_delta(dd, est_used, space); 1335 mutex_exit(&dd->dd_lock); 1336 1337 /* Make sure that we clean up dd_space_to* */ 1338 dsl_dir_dirty(dd, tx); 1339 1340 /* XXX this is potentially expensive and unnecessary... 
*/ 1341 if (parent_space && dd->dd_parent) 1342 dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); 1343 } 1344 1345 /* call from syncing context when we actually write/free space for this dd */ 1346 void 1347 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, 1348 int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) 1349 { 1350 int64_t accounted_delta; 1351 1352 /* 1353 * dsl_dataset_set_refreservation_sync_impl() calls this with 1354 * dd_lock held, so that it can atomically update 1355 * ds->ds_reserved and the dsl_dir accounting, so that 1356 * dsl_dataset_check_quota() can see dataset and dir accounting 1357 * consistently. 1358 */ 1359 boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); 1360 1361 ASSERT(dmu_tx_is_syncing(tx)); 1362 ASSERT(type < DD_USED_NUM); 1363 1364 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1365 1366 if (needlock) 1367 mutex_enter(&dd->dd_lock); 1368 accounted_delta = 1369 parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); 1370 ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); 1371 ASSERT(compressed >= 0 || 1372 dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); 1373 ASSERT(uncompressed >= 0 || 1374 dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); 1375 dsl_dir_phys(dd)->dd_used_bytes += used; 1376 dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; 1377 dsl_dir_phys(dd)->dd_compressed_bytes += compressed; 1378 1379 if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { 1380 ASSERT(used > 0 || 1381 dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); 1382 dsl_dir_phys(dd)->dd_used_breakdown[type] += used; 1383 #ifdef DEBUG 1384 dd_used_t t; 1385 uint64_t u = 0; 1386 for (t = 0; t < DD_USED_NUM; t++) 1387 u += dsl_dir_phys(dd)->dd_used_breakdown[t]; 1388 ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); 1389 #endif 1390 } 1391 if (needlock) 1392 mutex_exit(&dd->dd_lock); 1393 1394 if (dd->dd_parent != NULL) { 1395 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, 1396 accounted_delta, 
compressed, uncompressed, tx); 1397 dsl_dir_transfer_space(dd->dd_parent, 1398 used - accounted_delta, 1399 DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); 1400 } 1401 } 1402 1403 void 1404 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, 1405 dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) 1406 { 1407 ASSERT(dmu_tx_is_syncing(tx)); 1408 ASSERT(oldtype < DD_USED_NUM); 1409 ASSERT(newtype < DD_USED_NUM); 1410 1411 if (delta == 0 || 1412 !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) 1413 return; 1414 1415 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1416 mutex_enter(&dd->dd_lock); 1417 ASSERT(delta > 0 ? 1418 dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : 1419 dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); 1420 ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); 1421 dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; 1422 dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; 1423 mutex_exit(&dd->dd_lock); 1424 } 1425 1426 typedef struct dsl_dir_set_qr_arg { 1427 const char *ddsqra_name; 1428 zprop_source_t ddsqra_source; 1429 uint64_t ddsqra_value; 1430 } dsl_dir_set_qr_arg_t; 1431 1432 static int 1433 dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) 1434 { 1435 dsl_dir_set_qr_arg_t *ddsqra = arg; 1436 dsl_pool_t *dp = dmu_tx_pool(tx); 1437 dsl_dataset_t *ds; 1438 int error; 1439 uint64_t towrite, newval; 1440 1441 error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); 1442 if (error != 0) 1443 return (error); 1444 1445 error = dsl_prop_predict(ds->ds_dir, "quota", 1446 ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); 1447 if (error != 0) { 1448 dsl_dataset_rele(ds, FTAG); 1449 return (error); 1450 } 1451 1452 if (newval == 0) { 1453 dsl_dataset_rele(ds, FTAG); 1454 return (0); 1455 } 1456 1457 mutex_enter(&ds->ds_dir->dd_lock); 1458 /* 1459 * If we are doing the preliminary check in open context, and 1460 * there are pending changes, then don't fail it, since the 1461 * pending changes could under-estimate the amount of 
space to be 1462 * freed up. 1463 */ 1464 towrite = dsl_dir_space_towrite(ds->ds_dir); 1465 if ((dmu_tx_is_syncing(tx) || towrite == 0) && 1466 (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || 1467 newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { 1468 error = SET_ERROR(ENOSPC); 1469 } 1470 mutex_exit(&ds->ds_dir->dd_lock); 1471 dsl_dataset_rele(ds, FTAG); 1472 return (error); 1473 } 1474 1475 static void 1476 dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) 1477 { 1478 dsl_dir_set_qr_arg_t *ddsqra = arg; 1479 dsl_pool_t *dp = dmu_tx_pool(tx); 1480 dsl_dataset_t *ds; 1481 uint64_t newval; 1482 1483 VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); 1484 1485 if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { 1486 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), 1487 ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, 1488 &ddsqra->ddsqra_value, tx); 1489 1490 VERIFY0(dsl_prop_get_int_ds(ds, 1491 zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); 1492 } else { 1493 newval = ddsqra->ddsqra_value; 1494 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", 1495 zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); 1496 } 1497 1498 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1499 mutex_enter(&ds->ds_dir->dd_lock); 1500 dsl_dir_phys(ds->ds_dir)->dd_quota = newval; 1501 mutex_exit(&ds->ds_dir->dd_lock); 1502 dsl_dataset_rele(ds, FTAG); 1503 } 1504 1505 int 1506 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) 1507 { 1508 dsl_dir_set_qr_arg_t ddsqra; 1509 1510 ddsqra.ddsqra_name = ddname; 1511 ddsqra.ddsqra_source = source; 1512 ddsqra.ddsqra_value = quota; 1513 1514 return (dsl_sync_task(ddname, dsl_dir_set_quota_check, 1515 dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); 1516 } 1517 1518 int 1519 dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) 1520 { 1521 dsl_dir_set_qr_arg_t *ddsqra = arg; 1522 dsl_pool_t *dp = dmu_tx_pool(tx); 1523 dsl_dataset_t *ds; 1524 dsl_dir_t *dd; 1525 
uint64_t newval, used, avail; 1526 int error; 1527 1528 error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); 1529 if (error != 0) 1530 return (error); 1531 dd = ds->ds_dir; 1532 1533 /* 1534 * If we are doing the preliminary check in open context, the 1535 * space estimates may be inaccurate. 1536 */ 1537 if (!dmu_tx_is_syncing(tx)) { 1538 dsl_dataset_rele(ds, FTAG); 1539 return (0); 1540 } 1541 1542 error = dsl_prop_predict(ds->ds_dir, 1543 zfs_prop_to_name(ZFS_PROP_RESERVATION), 1544 ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); 1545 if (error != 0) { 1546 dsl_dataset_rele(ds, FTAG); 1547 return (error); 1548 } 1549 1550 mutex_enter(&dd->dd_lock); 1551 used = dsl_dir_phys(dd)->dd_used_bytes; 1552 mutex_exit(&dd->dd_lock); 1553 1554 if (dd->dd_parent) { 1555 avail = dsl_dir_space_available(dd->dd_parent, 1556 NULL, 0, FALSE); 1557 } else { 1558 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; 1559 } 1560 1561 if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { 1562 uint64_t delta = MAX(used, newval) - 1563 MAX(used, dsl_dir_phys(dd)->dd_reserved); 1564 1565 if (delta > avail || 1566 (dsl_dir_phys(dd)->dd_quota > 0 && 1567 newval > dsl_dir_phys(dd)->dd_quota)) 1568 error = SET_ERROR(ENOSPC); 1569 } 1570 1571 dsl_dataset_rele(ds, FTAG); 1572 return (error); 1573 } 1574 1575 void 1576 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) 1577 { 1578 uint64_t used; 1579 int64_t delta; 1580 1581 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1582 1583 mutex_enter(&dd->dd_lock); 1584 used = dsl_dir_phys(dd)->dd_used_bytes; 1585 delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); 1586 dsl_dir_phys(dd)->dd_reserved = value; 1587 1588 if (dd->dd_parent != NULL) { 1589 /* Roll up this additional usage into our ancestors */ 1590 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, 1591 delta, 0, 0, tx); 1592 } 1593 mutex_exit(&dd->dd_lock); 1594 } 1595 1596 1597 static void 1598 
dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) 1599 { 1600 dsl_dir_set_qr_arg_t *ddsqra = arg; 1601 dsl_pool_t *dp = dmu_tx_pool(tx); 1602 dsl_dataset_t *ds; 1603 uint64_t newval; 1604 1605 VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); 1606 1607 if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { 1608 dsl_prop_set_sync_impl(ds, 1609 zfs_prop_to_name(ZFS_PROP_RESERVATION), 1610 ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, 1611 &ddsqra->ddsqra_value, tx); 1612 1613 VERIFY0(dsl_prop_get_int_ds(ds, 1614 zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); 1615 } else { 1616 newval = ddsqra->ddsqra_value; 1617 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", 1618 zfs_prop_to_name(ZFS_PROP_RESERVATION), 1619 (longlong_t)newval); 1620 } 1621 1622 dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); 1623 dsl_dataset_rele(ds, FTAG); 1624 } 1625 1626 int 1627 dsl_dir_set_reservation(const char *ddname, zprop_source_t source, 1628 uint64_t reservation) 1629 { 1630 dsl_dir_set_qr_arg_t ddsqra; 1631 1632 ddsqra.ddsqra_name = ddname; 1633 ddsqra.ddsqra_source = source; 1634 ddsqra.ddsqra_value = reservation; 1635 1636 return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, 1637 dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); 1638 } 1639 1640 static dsl_dir_t * 1641 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) 1642 { 1643 for (; ds1; ds1 = ds1->dd_parent) { 1644 dsl_dir_t *dd; 1645 for (dd = ds2; dd; dd = dd->dd_parent) { 1646 if (ds1 == dd) 1647 return (dd); 1648 } 1649 } 1650 return (NULL); 1651 } 1652 1653 /* 1654 * If delta is applied to dd, how much of that delta would be applied to 1655 * ancestor? Syncing context only. 
1656 */ 1657 static int64_t 1658 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) 1659 { 1660 if (dd == ancestor) 1661 return (delta); 1662 1663 mutex_enter(&dd->dd_lock); 1664 delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); 1665 mutex_exit(&dd->dd_lock); 1666 return (would_change(dd->dd_parent, delta, ancestor)); 1667 } 1668 1669 typedef struct dsl_dir_rename_arg { 1670 const char *ddra_oldname; 1671 const char *ddra_newname; 1672 cred_t *ddra_cred; 1673 } dsl_dir_rename_arg_t; 1674 1675 /* ARGSUSED */ 1676 static int 1677 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 1678 { 1679 int *deltap = arg; 1680 char namebuf[MAXNAMELEN]; 1681 1682 dsl_dataset_name(ds, namebuf); 1683 1684 if (strlen(namebuf) + *deltap >= MAXNAMELEN) 1685 return (SET_ERROR(ENAMETOOLONG)); 1686 return (0); 1687 } 1688 1689 static int 1690 dsl_dir_rename_check(void *arg, dmu_tx_t *tx) 1691 { 1692 dsl_dir_rename_arg_t *ddra = arg; 1693 dsl_pool_t *dp = dmu_tx_pool(tx); 1694 dsl_dir_t *dd, *newparent; 1695 const char *mynewname; 1696 int error; 1697 int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); 1698 1699 /* target dir should exist */ 1700 error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); 1701 if (error != 0) 1702 return (error); 1703 1704 /* new parent should exist */ 1705 error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, 1706 &newparent, &mynewname); 1707 if (error != 0) { 1708 dsl_dir_rele(dd, FTAG); 1709 return (error); 1710 } 1711 1712 /* can't rename to different pool */ 1713 if (dd->dd_pool != newparent->dd_pool) { 1714 dsl_dir_rele(newparent, FTAG); 1715 dsl_dir_rele(dd, FTAG); 1716 return (SET_ERROR(ENXIO)); 1717 } 1718 1719 /* new name should not already exist */ 1720 if (mynewname == NULL) { 1721 dsl_dir_rele(newparent, FTAG); 1722 dsl_dir_rele(dd, FTAG); 1723 return (SET_ERROR(EEXIST)); 1724 } 1725 1726 /* if the name length is growing, validate child name lengths */ 1727 if (delta > 0) { 1728 error 
= dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, 1729 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 1730 if (error != 0) { 1731 dsl_dir_rele(newparent, FTAG); 1732 dsl_dir_rele(dd, FTAG); 1733 return (error); 1734 } 1735 } 1736 1737 if (dmu_tx_is_syncing(tx)) { 1738 if (spa_feature_is_active(dp->dp_spa, 1739 SPA_FEATURE_FS_SS_LIMIT)) { 1740 /* 1741 * Although this is the check function and we don't 1742 * normally make on-disk changes in check functions, 1743 * we need to do that here. 1744 * 1745 * Ensure this portion of the tree's counts have been 1746 * initialized in case the new parent has limits set. 1747 */ 1748 dsl_dir_init_fs_ss_count(dd, tx); 1749 } 1750 } 1751 1752 if (newparent != dd->dd_parent) { 1753 /* is there enough space? */ 1754 uint64_t myspace = 1755 MAX(dsl_dir_phys(dd)->dd_used_bytes, 1756 dsl_dir_phys(dd)->dd_reserved); 1757 objset_t *os = dd->dd_pool->dp_meta_objset; 1758 uint64_t fs_cnt = 0; 1759 uint64_t ss_cnt = 0; 1760 1761 if (dsl_dir_is_zapified(dd)) { 1762 int err; 1763 1764 err = zap_lookup(os, dd->dd_object, 1765 DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, 1766 &fs_cnt); 1767 if (err != ENOENT && err != 0) { 1768 dsl_dir_rele(newparent, FTAG); 1769 dsl_dir_rele(dd, FTAG); 1770 return (err); 1771 } 1772 1773 /* 1774 * have to add 1 for the filesystem itself that we're 1775 * moving 1776 */ 1777 fs_cnt++; 1778 1779 err = zap_lookup(os, dd->dd_object, 1780 DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, 1781 &ss_cnt); 1782 if (err != ENOENT && err != 0) { 1783 dsl_dir_rele(newparent, FTAG); 1784 dsl_dir_rele(dd, FTAG); 1785 return (err); 1786 } 1787 } 1788 1789 /* no rename into our descendant */ 1790 if (closest_common_ancestor(dd, newparent) == dd) { 1791 dsl_dir_rele(newparent, FTAG); 1792 dsl_dir_rele(dd, FTAG); 1793 return (SET_ERROR(EINVAL)); 1794 } 1795 1796 error = dsl_dir_transfer_possible(dd->dd_parent, 1797 newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); 1798 if (error != 0) { 1799 dsl_dir_rele(newparent, 
FTAG); 1800 dsl_dir_rele(dd, FTAG); 1801 return (error); 1802 } 1803 } 1804 1805 dsl_dir_rele(newparent, FTAG); 1806 dsl_dir_rele(dd, FTAG); 1807 return (0); 1808 } 1809 1810 static void 1811 dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) 1812 { 1813 dsl_dir_rename_arg_t *ddra = arg; 1814 dsl_pool_t *dp = dmu_tx_pool(tx); 1815 dsl_dir_t *dd, *newparent; 1816 const char *mynewname; 1817 int error; 1818 objset_t *mos = dp->dp_meta_objset; 1819 1820 VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); 1821 VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, 1822 &mynewname)); 1823 1824 /* Log this before we change the name. */ 1825 spa_history_log_internal_dd(dd, "rename", tx, 1826 "-> %s", ddra->ddra_newname); 1827 1828 if (newparent != dd->dd_parent) { 1829 objset_t *os = dd->dd_pool->dp_meta_objset; 1830 uint64_t fs_cnt = 0; 1831 uint64_t ss_cnt = 0; 1832 1833 /* 1834 * We already made sure the dd counts were initialized in the 1835 * check function. 1836 */ 1837 if (spa_feature_is_active(dp->dp_spa, 1838 SPA_FEATURE_FS_SS_LIMIT)) { 1839 VERIFY0(zap_lookup(os, dd->dd_object, 1840 DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, 1841 &fs_cnt)); 1842 /* add 1 for the filesystem itself that we're moving */ 1843 fs_cnt++; 1844 1845 VERIFY0(zap_lookup(os, dd->dd_object, 1846 DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, 1847 &ss_cnt)); 1848 } 1849 1850 dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, 1851 DD_FIELD_FILESYSTEM_COUNT, tx); 1852 dsl_fs_ss_count_adjust(newparent, fs_cnt, 1853 DD_FIELD_FILESYSTEM_COUNT, tx); 1854 1855 dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, 1856 DD_FIELD_SNAPSHOT_COUNT, tx); 1857 dsl_fs_ss_count_adjust(newparent, ss_cnt, 1858 DD_FIELD_SNAPSHOT_COUNT, tx); 1859 1860 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, 1861 -dsl_dir_phys(dd)->dd_used_bytes, 1862 -dsl_dir_phys(dd)->dd_compressed_bytes, 1863 -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); 1864 dsl_dir_diduse_space(newparent, DD_USED_CHILD, 1865 
dsl_dir_phys(dd)->dd_used_bytes, 1866 dsl_dir_phys(dd)->dd_compressed_bytes, 1867 dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); 1868 1869 if (dsl_dir_phys(dd)->dd_reserved > 1870 dsl_dir_phys(dd)->dd_used_bytes) { 1871 uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - 1872 dsl_dir_phys(dd)->dd_used_bytes; 1873 1874 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, 1875 -unused_rsrv, 0, 0, tx); 1876 dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, 1877 unused_rsrv, 0, 0, tx); 1878 } 1879 } 1880 1881 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1882 1883 /* remove from old parent zapobj */ 1884 error = zap_remove(mos, 1885 dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, 1886 dd->dd_myname, tx); 1887 ASSERT0(error); 1888 1889 (void) strcpy(dd->dd_myname, mynewname); 1890 dsl_dir_rele(dd->dd_parent, dd); 1891 dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; 1892 VERIFY0(dsl_dir_hold_obj(dp, 1893 newparent->dd_object, NULL, dd, &dd->dd_parent)); 1894 1895 /* add to new parent zapobj */ 1896 VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, 1897 dd->dd_myname, 8, 1, &dd->dd_object, tx)); 1898 1899 dsl_prop_notify_all(dd); 1900 1901 dsl_dir_rele(newparent, FTAG); 1902 dsl_dir_rele(dd, FTAG); 1903 } 1904 1905 int 1906 dsl_dir_rename(const char *oldname, const char *newname) 1907 { 1908 dsl_dir_rename_arg_t ddra; 1909 1910 ddra.ddra_oldname = oldname; 1911 ddra.ddra_newname = newname; 1912 ddra.ddra_cred = CRED(); 1913 1914 return (dsl_sync_task(oldname, 1915 dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 1916 3, ZFS_SPACE_CHECK_RESERVED)); 1917 } 1918 1919 int 1920 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, 1921 uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) 1922 { 1923 dsl_dir_t *ancestor; 1924 int64_t adelta; 1925 uint64_t avail; 1926 int err; 1927 1928 ancestor = closest_common_ancestor(sdd, tdd); 1929 adelta = would_change(sdd, -space, ancestor); 1930 avail = dsl_dir_space_available(tdd, ancestor, adelta, 
FALSE); 1931 if (avail < space) 1932 return (SET_ERROR(ENOSPC)); 1933 1934 err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, 1935 ancestor, cr); 1936 if (err != 0) 1937 return (err); 1938 err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, 1939 ancestor, cr); 1940 if (err != 0) 1941 return (err); 1942 1943 return (0); 1944 } 1945 1946 timestruc_t 1947 dsl_dir_snap_cmtime(dsl_dir_t *dd) 1948 { 1949 timestruc_t t; 1950 1951 mutex_enter(&dd->dd_lock); 1952 t = dd->dd_snap_cmtime; 1953 mutex_exit(&dd->dd_lock); 1954 1955 return (t); 1956 } 1957 1958 void 1959 dsl_dir_snap_cmtime_update(dsl_dir_t *dd) 1960 { 1961 timestruc_t t; 1962 1963 gethrestime(&t); 1964 mutex_enter(&dd->dd_lock); 1965 dd->dd_snap_cmtime = t; 1966 mutex_exit(&dd->dd_lock); 1967 } 1968 1969 void 1970 dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) 1971 { 1972 objset_t *mos = dd->dd_pool->dp_meta_objset; 1973 dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); 1974 } 1975 1976 boolean_t 1977 dsl_dir_is_zapified(dsl_dir_t *dd) 1978 { 1979 dmu_object_info_t doi; 1980 1981 dmu_object_info_from_db(dd->dd_dbuf, &doi); 1982 return (doi.doi_type == DMU_OTN_ZAP_METADATA); 1983 }