ReactOS  0.4.15-dev-5142-g967f5b9
balance.c
Go to the documentation of this file.
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include "crc32c.h"
21 #include <ntddstor.h>
22 
// NOTE(review): this listing dropped several member lines and the closing lines of most
// of these typedefs. The type names mentioned below are inferred from how the members
// are used later in the file and must be confirmed against the full source.

// Presumably metadata_reloc: both members below are accessed through metadata_reloc* (mr->t, mr->system).
23 typedef struct {
// loaded tree whose header.address equals this block's address, or NULL (assigned by the trees_hash lookup further down)
28  tree* t;
// set to true when the chunk the block was read from has BLOCK_FLAG_SYSTEM in its type
29  bool system;
33 
// Presumably metadata_reloc_ref: ->top is accessed through metadata_reloc_ref pointers.
34 typedef struct {
37 
38  union {
41  };
42 
// set when no loaded ancestor tree one level up was found for a TREE_BLOCK_REF;
// refs with top set update the owning root's address rather than a parent node
44  bool top;
47 
48 typedef struct {
56 } data_reloc;
57 
// Presumably the per-reference record linked into data_reloc.refs - TODO confirm, name not visible here.
58 typedef struct {
61 
62  union {
65  };
66 
70 
71 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
72 
74  bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
// Builds a metadata_reloc record for the tree-block extent item that tp points at.
// (Presumably add_metadata_reloc, judging by the matching call sites; the first line of
// the signature, the Status declaration, the allocation of mr and a few other statements
// are missing from this listing.)
//
// skinny   - false when the item carries an EXTENT_ITEM2 after the EXTENT_ITEM, which is
//            then skipped before the inline references are parsed
// mr2      - optional; receives the new record on success
// c        - optional chunk; when non-NULL its used counter is reduced by one node and the
//            node's range is handed to space_list_add
// rollback - passed through to space_list_add
//
// Returns STATUS_SUCCESS, or the failing status / STATUS_INTERNAL_ERROR on malformed items.
76  metadata_reloc* mr;
77  EXTENT_ITEM* ei;
78  uint16_t len;
79  uint64_t inline_rc;
80  uint8_t* ptr;
81 
83  if (!mr) {
84  ERR("out of memory\n");
86  }
87 
// the extent item's key object id is the address of the tree block
88  mr->address = tp->item->key.obj_id;
89  mr->data = NULL;
90  mr->ei = (EXTENT_ITEM*)tp->item->data;
91  mr->system = false;
93 
// NOTE(review): the call that sets Status is not visible; per the message below it is delete_tree_item
95  if (!NT_SUCCESS(Status)) {
96  ERR("delete_tree_item returned %08lx\n", Status);
97  ExFreePool(mr);
98  return Status;
99  }
100 
101  if (!c)
103 
104  if (c) {
106 
107  c->used -= Vcb->superblock.node_size;
108 
109  space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
110 
112  }
113 
114  ei = (EXTENT_ITEM*)tp->item->data;
115  inline_rc = 0;
116 
// inline references follow the EXTENT_ITEM (and the EXTENT_ITEM2, unless skinny)
117  len = tp->item->size - sizeof(EXTENT_ITEM);
118  ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
119  if (!skinny) {
120  len -= sizeof(EXTENT_ITEM2);
121  ptr += sizeof(EXTENT_ITEM2);
122  }
123 
// each inline entry is one type byte followed by a TREE_BLOCK_REF or SHARED_BLOCK_REF payload
124  while (len > 0) {
125  uint8_t secttype = *ptr;
126  uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
128 
// account for the type byte before checking that the payload fits
129  len--;
130 
131  if (sectlen > len) {
132  ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
133  return STATUS_INTERNAL_ERROR;
134  }
135 
136  if (sectlen == 0) {
137  ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
138  return STATUS_INTERNAL_ERROR;
139  }
140 
// NOTE(review): the declaration/allocation of ref is on a line missing from this listing
142  if (!ref) {
143  ERR("out of memory\n");
145  }
146 
147  if (secttype == TYPE_TREE_BLOCK_REF) {
148  ref->type = TYPE_TREE_BLOCK_REF;
149  RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
150  inline_rc++;
151  } else if (secttype == TYPE_SHARED_BLOCK_REF) {
152  ref->type = TYPE_SHARED_BLOCK_REF;
153  RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
154  inline_rc++;
155  } else {
156  ERR("unexpected tree type %x\n", secttype);
157  ExFreePool(ref);
158  return STATUS_INTERNAL_ERROR;
159  }
160 
161  ref->parent = NULL;
162  ref->top = false;
164 
165  len -= sectlen;
166  ptr += sizeof(uint8_t) + sectlen;
167  }
168 
// fewer inline refs than the recorded refcount: the rest are separate items that follow
// this one in the tree and share its object id
169  if (inline_rc < ei->refcount) { // look for non-inline entries
170  traverse_ptr tp2 = *tp, next_tp;
171 
172  while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
173  tp2 = next_tp;
174 
175  if (tp2.item->key.obj_id == tp->item->key.obj_id) {
176  if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
178  if (!ref) {
179  ERR("out of memory\n");
181  }
182 
// for keyed (non-inline) refs the reference value is stored in the key offset
183  ref->type = TYPE_TREE_BLOCK_REF;
184  ref->tbr.offset = tp2.item->key.offset;
185  ref->parent = NULL;
186  ref->top = false;
188 
189  Status = delete_tree_item(Vcb, &tp2);
190  if (!NT_SUCCESS(Status)) {
191  ERR("delete_tree_item returned %08lx\n", Status);
192  return Status;
193  }
194  } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
196  if (!ref) {
197  ERR("out of memory\n");
199  }
200 
201  ref->type = TYPE_SHARED_BLOCK_REF;
202  ref->sbr.offset = tp2.item->key.offset;
203  ref->parent = NULL;
204  ref->top = false;
206 
207  Status = delete_tree_item(Vcb, &tp2);
208  if (!NT_SUCCESS(Status)) {
209  ERR("delete_tree_item returned %08lx\n", Status);
210  return Status;
211  }
212  }
213  } else
214  break;
215  }
216  }
217 
219 
220  if (mr2)
221  *mr2 = mr;
222 
223  return STATUS_SUCCESS;
224 }
225 
// Returns, through mr2, the relocation record for the tree block at `address`, creating it
// if `items` does not already hold one. (Presumably add_metadata_reloc_parent, judging by
// the call sites; the signature and the Status/tp declarations are missing from this listing.)
228  LIST_ENTRY* le;
229  KEY searchkey;
231  bool skinny = false;
233 
// first look for an existing record with this address
234  le = items->Flink;
235  while (le != items) {
237 
238  if (mr->address == address) {
239  *mr2 = mr;
240  return STATUS_SUCCESS;
241  }
242 
243  le = le->Flink;
244  }
245 
// not queued yet: look the block up in the extent tree; with the maximum offset the
// search key sorts after any METADATA_ITEM or EXTENT_ITEM key for this address
246  searchkey.obj_id = address;
247  searchkey.obj_type = TYPE_METADATA_ITEM;
248  searchkey.offset = 0xffffffffffffffff;
249 
250  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
251  if (!NT_SUCCESS(Status)) {
252  ERR("find_item returned %08lx\n", Status);
253  return Status;
254  }
255 
// NOTE(review): the condition guarding `skinny = true` is on a line missing from this listing
257  skinny = true;
258  else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
259  tp.item->size >= sizeof(EXTENT_ITEM)) {
261 
// a full-size EXTENT_ITEM is only accepted if it is flagged as a tree block
262  if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
263  ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
264  return STATUS_INTERNAL_ERROR;
265  }
266  } else {
267  ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
268  return STATUS_INTERNAL_ERROR;
269  }
270 
// no chunk is passed here, so add_metadata_reloc skips its `if (c)` accounting branch
271  Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
272  if (!NT_SUCCESS(Status)) {
273  ERR("add_metadata_reloc returned %08lx\n", Status);
274  return Status;
275  }
276 
277  return STATUS_SUCCESS;
278 }
279 
// Re-orders mr->refs by rebuilding the list in `newlist` and splicing it back.
// NOTE(review): the signature and the lines that take an entry off mr->refs, declare
// ref/ref2 and link ref in front of ref2 are missing from this listing; the function name
// is not visible anywhere in this chunk.
281  LIST_ENTRY newlist, *le;
282 
283  if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
284  return;
285 
286  // insertion sort
287 
288  InitializeListHead(&newlist);
289 
290  while (!IsListEmpty(&mr->refs)) {
292  bool inserted = false;
293 
// sort key within a type: the root id for tree refs, the parent's relocated address for shared refs
294  if (ref->type == TYPE_TREE_BLOCK_REF)
295  ref->hash = ref->tbr.offset;
296  else if (ref->type == TYPE_SHARED_BLOCK_REF)
297  ref->hash = ref->parent->new_address;
298 
299  le = newlist.Flink;
300  while (le != &newlist) {
302 
// stop at the first entry with a larger type, or the same type and a smaller hash
// (presumably ref is linked in ahead of it on the missing line)
303  if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
305  inserted = true;
306  break;
307  }
308 
309  le = le->Flink;
310  }
311 
312  if (!inserted)
313  InsertTailList(&newlist, &ref->list_entry);
314  }
315 
// splice: make mr->refs the head of the chain that newlist currently heads
316  newlist.Flink->Blink = &mr->refs;
317  newlist.Blink->Flink = &mr->refs;
318  mr->refs.Flink = newlist.Flink;
319  mr->refs.Blink = newlist.Blink;
320 }
321 
// Writes the extent-tree entries for a relocated tree block at mr->new_address and moves
// the shared back-references held by the block's children from mr->address to
// mr->new_address. (Presumably add_metadata_reloc_extent_item, per the error message in the
// caller; the signature, the Status declaration and several other lines are missing from
// this listing.)
324  LIST_ENTRY* le;
325  uint64_t rc = 0;
326  uint16_t inline_len;
327  bool all_inline = true;
328  metadata_reloc_ref* first_noninline = NULL;
329  EXTENT_ITEM* ei;
330  uint8_t* ptr;
331 
// without the skinny-metadata feature the item also carries an EXTENT_ITEM2
332  inline_len = sizeof(EXTENT_ITEM);
333  if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
334  inline_len += sizeof(EXTENT_ITEM2);
335 
337 
// pass 1: count the refs and work out how many of them fit inline
338  le = mr->refs.Flink;
339  while (le != &mr->refs) {
341  uint16_t extlen = 0;
342 
343  rc++;
344 
345  if (ref->type == TYPE_TREE_BLOCK_REF)
346  extlen += sizeof(TREE_BLOCK_REF);
347  else if (ref->type == TYPE_SHARED_BLOCK_REF)
348  extlen += sizeof(SHARED_BLOCK_REF);
349 
350  if (all_inline) {
// the item (including one type byte per ref) is capped at a quarter of the node size;
// the first ref that would exceed the cap and all refs after it are stored as separate items
351  if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
352  all_inline = false;
353  first_noninline = ref;
354  } else
355  inline_len += extlen + 1;
356  }
357 
358  le = le->Flink;
359  }
360 
361  ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
362  if (!ei) {
363  ERR("out of memory\n");
365  }
366 
// refcount covers every ref, inline or not; generation and flags are carried over from the old item
367  ei->refcount = rc;
368  ei->generation = mr->ei->generation;
369  ei->flags = mr->ei->flags;
370  ptr = (uint8_t*)&ei[1];
371 
372  if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
373  EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
374 
// the first key is read from directly after the tree header in the node buffer
375  ei2->firstitem = *(KEY*)&mr->data[1];
376  ei2->level = mr->data->level;
377 
378  ptr += sizeof(EXTENT_ITEM2);
379  }
380 
// pass 2: serialise the inline refs, stopping at the first non-inline one
381  le = mr->refs.Flink;
382  while (le != &mr->refs) {
384 
385  if (ref == first_noninline)
386  break;
387 
388  *ptr = ref->type;
389  ptr++;
390 
391  if (ref->type == TYPE_TREE_BLOCK_REF) {
393 
394  tbr->offset = ref->tbr.offset;
395 
396  ptr += sizeof(TREE_BLOCK_REF);
397  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
399 
// a shared ref points at the parent's relocated address
400  sbr->offset = ref->parent->new_address;
401 
402  ptr += sizeof(SHARED_BLOCK_REF);
403  }
404 
405  le = le->Flink;
406  }
407 
// skinny metadata: METADATA_ITEM keyed by level; otherwise EXTENT_ITEM keyed by node size
408  if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
409  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
410  else
411  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
412 
413  if (!NT_SUCCESS(Status)) {
414  ERR("insert_tree_item returned %08lx\n", Status);
415  ExFreePool(ei);
416  return Status;
417  }
418 
// refs that did not fit inline become empty items whose key offset carries the reference
419  if (!all_inline) {
420  le = &first_noninline->list_entry;
421 
422  while (le != &mr->refs) {
424 
425  if (ref->type == TYPE_TREE_BLOCK_REF) {
426  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
427  if (!NT_SUCCESS(Status)) {
428  ERR("insert_tree_item returned %08lx\n", Status);
429  return Status;
430  }
431  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
432  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
433  if (!NT_SUCCESS(Status)) {
434  ERR("insert_tree_item returned %08lx\n", Status);
435  return Status;
436  }
437  }
438 
439  le = le->Flink;
440  }
441  }
442 
// internal node: children that hold a shared block ref to the old address get one to the
// new address instead (sbrrc is computed on a line missing from this listing)
444  if (mr->data->level > 0) {
445  uint16_t i;
446  internal_node* in = (internal_node*)&mr->data[1];
447 
448  for (i = 0; i < mr->data->num_items; i++) {
450 
451  if (sbrrc > 0) {
452  SHARED_BLOCK_REF sbr;
453 
454  sbr.offset = mr->new_address;
455 
// the new ref is added before the old one is removed
456  Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
457  if (!NT_SUCCESS(Status)) {
458  ERR("increase_extent_refcount returned %08lx\n", Status);
459  return Status;
460  }
461 
462  sbr.offset = mr->address;
463 
464  Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
465  sbr.offset, false, NULL);
466  if (!NT_SUCCESS(Status)) {
467  ERR("decrease_extent_refcount returned %08lx\n", Status);
468  return Status;
469  }
470  }
471  }
472  } else {
// leaf: do the same for shared data refs held by the data extents this leaf references
473  uint16_t i;
474  leaf_node* ln = (leaf_node*)&mr->data[1];
475 
476  for (i = 0; i < mr->data->num_items; i++) {
477  if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
// item offsets are relative to the end of the tree header
478  EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
479 
480  if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
481  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
482 
483  if (ed2->size > 0) { // not sparse
485 
// NOTE(review): sdrrc, both refcount calls and the lookup of c are on lines missing from this listing
486  if (sdrrc > 0) {
487  SHARED_DATA_REF sdr;
488  chunk* c;
489 
490  sdr.offset = mr->new_address;
491  sdr.count = sdrrc;
492 
494  if (!NT_SUCCESS(Status)) {
495  ERR("increase_extent_refcount returned %08lx\n", Status);
496  return Status;
497  }
498 
499  sdr.offset = mr->address;
500 
502  sdr.offset, false, NULL);
503  if (!NT_SUCCESS(Status)) {
504  ERR("decrease_extent_refcount returned %08lx\n", Status);
505  return Status;
506  }
507 
509 
510  if (c) {
511  // check changed_extents
512 
513  ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
514 
515  le = c->changed_extents.Flink;
516 
517  while (le != &c->changed_extents) {
519 
// pending changes for this extent that still name the old block are re-pointed
// at the new address, in both the refs and old_refs lists
520  if (ce->address == ed2->address) {
521  LIST_ENTRY* le2;
522 
523  le2 = ce->refs.Flink;
524  while (le2 != &ce->refs) {
526 
527  if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
528  cer->sdr.offset = mr->new_address;
529  break;
530  }
531 
532  le2 = le2->Flink;
533  }
534 
535  le2 = ce->old_refs.Flink;
536  while (le2 != &ce->old_refs) {
538 
539  if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
540  cer->sdr.offset = mr->new_address;
541  break;
542  }
543 
544  le2 = le2->Flink;
545  }
546 
547  break;
548  }
549 
550  le = le->Flink;
551  }
552 
553  ExReleaseResourceLite(&c->changed_extents_lock);
554  }
555  }
556  }
557  }
558  }
559  }
560  }
561  }
562 
563  return STATUS_SUCCESS;
564 }
565 
567  LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
// Relocates every tree block queued in `items`. (Presumably write_metadata_items, per the
// error message in the caller; the first line of the signature, the Status/tp declarations
// and many CONTAINING_RECORD lines are missing from this listing.)
//
// Visible phases:
//   1. read each block into mr->data, patch relocated data-extent addresses in leaves,
//      and resolve the parent record of every reference;
//   2. find the loaded in-memory tree, if any, for each block;
//   3. level by level from 0 up to max_level: pick a new address, patch parents / roots /
//      loaded trees, checksum the node and queue it for writing;
//   4. write the queued nodes, then process each record again (the call is not visible;
//      per the error message it is add_metadata_reloc_extent_item).
//
// data_items - optional list of relocated data extents (address -> new_address)
// c          - optional chunk; handed to read_data when the block lies inside it
568  LIST_ENTRY tree_writes, *le;
571  uint8_t level, max_level = 0;
572  chunk* newchunk = NULL;
573 
574  InitializeListHead(&tree_writes);
575 
// phase 1
576  le = items->Flink;
577  while (le != items) {
579  LIST_ENTRY* le2;
580  chunk* pc;
581 
582  mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
583  if (!mr->data) {
584  ERR("out of memory\n");
586  }
587 
588  Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
589  c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
590  if (!NT_SUCCESS(Status)) {
591  ERR("read_data returned %08lx\n", Status);
592  return Status;
593  }
594 
// remember whether the block came from a system chunk; this selects the chunk flags in phase 3
595  if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
596  mr->system = true;
597 
// leaves: rewrite file extents that point at relocated data
598  if (data_items && mr->data->level == 0) {
599  le2 = data_items->Flink;
600  while (le2 != data_items) {
602  leaf_node* ln = (leaf_node*)&mr->data[1];
603  uint16_t i;
604 
605  for (i = 0; i < mr->data->num_items; i++) {
606  if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
607  EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
608 
609  if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
610  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
611 
612  if (ed2->address == dr->address)
613  ed2->address = dr->new_address;
614  }
615  }
616  }
617 
618  le2 = le2->Flink;
619  }
620  }
621 
622  if (mr->data->level > max_level)
623  max_level = mr->data->level;
624 
// resolve the parent of each reference
625  le2 = mr->refs.Flink;
626  while (le2 != &mr->refs) {
628 
629  if (ref->type == TYPE_TREE_BLOCK_REF) {
630  KEY* firstitem;
631  root* r = NULL;
632  LIST_ENTRY* le3;
633  tree* t;
634 
635  firstitem = (KEY*)&mr->data[1];
636 
// the ref's offset is matched against the ids of the loaded roots
637  le3 = Vcb->roots.Flink;
638  while (le3 != &Vcb->roots) {
640 
641  if (r2->id == ref->tbr.offset) {
642  r = r2;
643  break;
644  }
645 
646  le3 = le3->Flink;
647  }
648 
649  if (!r) {
650  ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
651  return STATUS_INTERNAL_ERROR;
652  }
653 
// search that root for the block's first key, passing one level above this block
654  Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
656  ERR("find_item_to_level returned %08lx\n", Status);
657  return Status;
658  }
659 
// walk up from the returned tree until reaching one at or above the parent level
660  t = tp.tree;
661  while (t && t->header.level < mr->data->level + 1) {
662  t = t->parent;
663  }
664 
// no such ancestor: mark the ref as top so phase 3 updates the root instead of a parent node
665  if (!t)
666  ref->top = true;
667  else {
668  metadata_reloc* mr2;
669 
670  Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
671  if (!NT_SUCCESS(Status)) {
672  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
673  return Status;
674  }
675 
676  ref->parent = mr2;
677  }
678  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
679  metadata_reloc* mr2;
680 
// a shared ref names the parent block's address directly
681  Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
682  if (!NT_SUCCESS(Status)) {
683  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
684  return Status;
685  }
686 
687  ref->parent = mr2;
688  }
689 
690  le2 = le2->Flink;
691  }
692 
693  le = le->Flink;
694  }
695 
// phase 2: find the loaded tree for each block via the address hash
696  le = items->Flink;
697  while (le != items) {
699  LIST_ENTRY* le2;
700  uint32_t hash;
701 
702  mr->t = NULL;
703 
704  hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
705 
// trees_ptrs is indexed by the top byte of the hash and gives a starting point in trees_hash
706  le2 = Vcb->trees_ptrs[hash >> 24];
707 
708  if (le2) {
709  while (le2 != &Vcb->trees_hash) {
710  tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
711 
712  if (t->header.address == mr->address) {
713  mr->t = t;
714  break;
715  } else if (t->hash > hash)
716  break;
717 
718  le2 = le2->Flink;
719  }
720  }
721 
722  le = le->Flink;
723  }
724 
// phase 3: blocks are handled in ascending level order, so a block's own new address is
// patched into its parent's buffer before the parent is checksummed and queued
725  for (level = 0; level <= max_level; level++) {
726  le = items->Flink;
727  while (le != items) {
729 
730  if (mr->data->level == level) {
731  bool done = false;
732  LIST_ENTRY* le2;
733  tree_write* tw;
734  uint64_t flags;
735  tree* t3;
736 
737  if (mr->system)
738  flags = Vcb->system_flags;
739  else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
740  flags = Vcb->data_flags;
741  else
742  flags = Vcb->metadata_flags;
743 
// first try the chunk used for the previous allocation
744  if (newchunk) {
745  acquire_chunk_lock(newchunk, Vcb);
746 
747  if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
748  newchunk->used += Vcb->superblock.node_size;
749  space_list_subtract(newchunk, mr->new_address, Vcb->superblock.node_size, rollback);
750  done = true;
751  }
752 
753  release_chunk_lock(newchunk, Vcb);
754  }
755 
// then any other writable chunk of the right type that is not itself being relocated
756  if (!done) {
757  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
758 
759  le2 = Vcb->chunks.Flink;
760  while (le2 != &Vcb->chunks) {
762 
763  if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
764  acquire_chunk_lock(c2, Vcb);
765 
766  if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
// NOTE(review): a nested condition is on a line missing from this listing
768  c2->used += Vcb->superblock.node_size;
769  space_list_subtract(c2, mr->new_address, Vcb->superblock.node_size, rollback);
770  release_chunk_lock(c2, Vcb);
771  newchunk = c2;
772  done = true;
773  break;
774  }
775  }
776 
777  release_chunk_lock(c2, Vcb);
778  }
779 
780  le2 = le2->Flink;
781  }
782 
783  // allocate new chunk if necessary
784  if (!done) {
785  Status = alloc_chunk(Vcb, flags, &newchunk, false);
786 
787  if (!NT_SUCCESS(Status)) {
788  ERR("alloc_chunk returned %08lx\n", Status);
789  ExReleaseResourceLite(&Vcb->chunk_lock);
790  goto end;
791  }
792 
793  acquire_chunk_lock(newchunk, Vcb);
794 
795  newchunk->balance_num = Vcb->balance.balance_num;
796 
797  if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
798  release_chunk_lock(newchunk, Vcb);
799  ExReleaseResourceLite(&Vcb->chunk_lock);
800  ERR("could not find address in new chunk\n");
802  goto end;
803  } else {
804  newchunk->used += Vcb->superblock.node_size;
805  space_list_subtract(newchunk, mr->new_address, Vcb->superblock.node_size, rollback);
806  }
807 
808  release_chunk_lock(newchunk, Vcb);
809  }
810 
811  ExReleaseResourceLite(&Vcb->chunk_lock);
812  }
813 
814  // update parents
815  le2 = mr->refs.Flink;
816  while (le2 != &mr->refs) {
818 
819  if (ref->parent) {
820  uint16_t i;
821  internal_node* in = (internal_node*)&ref->parent->data[1];
822 
// patch the child pointer in the parent's on-disk image...
823  for (i = 0; i < ref->parent->data->num_items; i++) {
824  if (in[i].address == mr->address) {
825  in[i].address = mr->new_address;
826  break;
827  }
828  }
829 
// ...and in the parent's loaded tree, if there is one
830  if (ref->parent->t) {
831  LIST_ENTRY* le3;
832 
833  le3 = ref->parent->t->itemlist.Flink;
834  while (le3 != &ref->parent->t->itemlist) {
836 
837  if (!td->inserted && td->treeholder.address == mr->address)
838  td->treeholder.address = mr->new_address;
839 
840  le3 = le3->Flink;
841  }
842  }
843  } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
844  LIST_ENTRY* le3;
845  root* r = NULL;
846 
847  // alter ROOT_ITEM
848 
849  le3 = Vcb->roots.Flink;
850  while (le3 != &Vcb->roots) {
852 
853  if (r2->id == ref->tbr.offset) {
854  r = r2;
855  break;
856  }
857 
858  le3 = le3->Flink;
859  }
860 
861  if (r) {
862  r->treeholder.address = mr->new_address;
863 
// for root_root and chunk_root the superblock address field is updated instead of a ROOT_ITEM
864  if (r == Vcb->root_root)
865  Vcb->superblock.root_tree_addr = mr->new_address;
866  else if (r == Vcb->chunk_root)
867  Vcb->superblock.chunk_tree_addr = mr->new_address;
868  else if (r->root_item.block_number == mr->address) {
869  KEY searchkey;
870  ROOT_ITEM* ri;
871 
872  r->root_item.block_number = mr->new_address;
873 
874  searchkey.obj_id = r->id;
875  searchkey.obj_type = TYPE_ROOT_ITEM;
876  searchkey.offset = 0xffffffffffffffff;
877 
878  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
879  if (!NT_SUCCESS(Status)) {
880  ERR("find_item returned %08lx\n", Status);
881  goto end;
882  }
883 
884  if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
885  ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
887  goto end;
888  }
889 
// NOTE(review): the allocation of ri is on a line missing from this listing
891  if (!ri) {
892  ERR("out of memory\n");
894  goto end;
895  }
896 
897  RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
898 
// the ROOT_ITEM is replaced under the same key (the delete call itself is not visible here)
900  if (!NT_SUCCESS(Status)) {
901  ERR("delete_tree_item returned %08lx\n", Status);
902  goto end;
903  }
904 
905  Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
906  if (!NT_SUCCESS(Status)) {
907  ERR("insert_tree_item returned %08lx\n", Status);
908  goto end;
909  }
910  }
911  }
912  }
913 
914  le2 = le2->Flink;
915  }
916 
917  mr->data->address = mr->new_address;
918 
919  t3 = mr->t;
920 
// re-key every loaded copy of this tree under its new address
921  while (t3) {
922  uint8_t h;
923  bool inserted;
924  tree* t4 = NULL;
925 
926  // check if tree loaded more than once
927  if (t3->list_entry.Flink != &Vcb->trees_hash) {
928  tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
929 
930  if (nt->header.address == t3->header.address)
931  t4 = nt;
932  }
933 
934  t3->header.address = mr->new_address;
935 
936  h = t3->hash >> 24;
937 
// if t3 is the bucket's entry point, hand the bucket to its successor when that
// successor has the same top hash byte; otherwise clear the bucket
938  if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
939  if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
940  Vcb->trees_ptrs[h] = NULL;
941  else {
942  tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
943 
944  if (t2->hash >> 24 == h)
945  Vcb->trees_ptrs[h] = &t2->list_entry_hash;
946  else
947  Vcb->trees_ptrs[h] = NULL;
948  }
949  }
950 
952 
953  t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
954  h = t3->hash >> 24;
955 
// choose where to start scanning for the insert position: the new bucket's entry,
// else the nearest non-empty lower bucket found by the loop below, else the list head
956  if (!Vcb->trees_ptrs[h]) {
957  uint8_t h2 = h;
958 
959  le2 = Vcb->trees_hash.Flink;
960 
961  if (h2 > 0) {
962  h2--;
963  do {
964  if (Vcb->trees_ptrs[h2]) {
965  le2 = Vcb->trees_ptrs[h2];
966  break;
967  }
968 
969  h2--;
970  } while (h2 > 0);
971  }
972  } else
973  le2 = Vcb->trees_ptrs[h];
974 
975  inserted = false;
976  while (le2 != &Vcb->trees_hash) {
977  tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
978 
// NOTE(review): the statement that links t3 in before t2 is on a line missing from this listing
979  if (t2->hash >= t3->hash) {
981  inserted = true;
982  break;
983  }
984 
985  le2 = le2->Flink;
986  }
987 
988  if (!inserted)
989  InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
990 
991  if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
992  Vcb->trees_ptrs[h] = &t3->list_entry_hash;
993 
// loaded leaves also get relocated data-extent addresses patched in their cached items
994  if (data_items && level == 0) {
995  le2 = data_items->Flink;
996 
997  while (le2 != data_items) {
999  LIST_ENTRY* le3 = t3->itemlist.Flink;
1000 
1001  while (le3 != &t3->itemlist) {
1003 
1004  if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1005  EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1006 
1007  if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1008  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1009 
1010  if (ed2->address == dr->address)
1011  ed2->address = dr->new_address;
1012  }
1013  }
1014 
1015  le3 = le3->Flink;
1016  }
1017 
1018  le2 = le2->Flink;
1019  }
1020  }
1021 
1022  t3 = t4;
1023  }
1024 
// checksum is computed after the header address and child pointers have been patched
1025  calc_tree_checksum(Vcb, mr->data);
1026 
1028  if (!tw) {
1029  ERR("out of memory\n");
1031  goto end;
1032  }
1033 
// tw->data aliases mr->data; allocated = false means the cleanup at `end` does not free it
1034  tw->address = mr->new_address;
1035  tw->length = Vcb->superblock.node_size;
1036  tw->data = (uint8_t*)mr->data;
1037  tw->allocated = false;
1038 
// keep tree_writes sorted by ascending address
1039  if (IsListEmpty(&tree_writes))
1040  InsertTailList(&tree_writes, &tw->list_entry);
1041  else {
1042  bool inserted = false;
1043 
1044  le2 = tree_writes.Flink;
1045  while (le2 != &tree_writes) {
1047 
1048  if (tw2->address > tw->address) {
1049  InsertHeadList(le2->Blink, &tw->list_entry);
1050  inserted = true;
1051  break;
1052  }
1053 
1054  le2 = le2->Flink;
1055  }
1056 
1057  if (!inserted)
1058  InsertTailList(&tree_writes, &tw->list_entry);
1059  }
1060  }
1061 
1062  le = le->Flink;
1063  }
1064  }
1065 
// phase 4
1066  Status = do_tree_writes(Vcb, &tree_writes, true);
1067  if (!NT_SUCCESS(Status)) {
1068  ERR("do_tree_writes returned %08lx\n", Status);
1069  goto end;
1070  }
1071 
1072  le = items->Flink;
1073  while (le != items) {
1075 
1077  if (!NT_SUCCESS(Status)) {
1078  ERR("add_metadata_reloc_extent_item returned %08lx\n", Status);
1079  goto end;
1080  }
1081 
1082  le = le->Flink;
1083  }
1084 
1086 
// common exit: release the tree_write entries (and their buffers only when marked allocated)
1087 end:
1088  while (!IsListEmpty(&tree_writes)) {
1090 
1091  if (tw->allocated)
1092  ExFreePool(tw->data);
1093 
1094  ExFreePool(tw);
1095  }
1096 
1097  return Status;
1098 }
1099 
// Relocates a batch of tree blocks out of chunk c and reports through *changed whether
// any were found.
// NOTE(review): the signature, the declarations/initialisation of `items` and `rollback`,
// the write_metadata_items call and the rollback handling at `end` are on lines missing
// from this listing; the function name is not visible anywhere in this chunk.
1101  KEY searchkey;
1102  traverse_ptr tp;
1103  NTSTATUS Status;
1104  bool b;
1106  uint32_t loaded = 0;
1107 
1108  TRACE("chunk %I64x\n", c->offset);
1109 
1112 
// the tree lock is held exclusively until after free_trees at the end of the function
1113  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1114 
// start the extent-tree scan at the chunk's first address
1115  searchkey.obj_id = c->offset;
1116  searchkey.obj_type = TYPE_METADATA_ITEM;
1117  searchkey.offset = 0xffffffffffffffff;
1118 
1119  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1120  if (!NT_SUCCESS(Status)) {
1121  ERR("find_item returned %08lx\n", Status);
1122  goto end;
1123  }
1124 
1125  do {
1126  traverse_ptr next_tp;
1127 
// stop once past the end of the chunk
1128  if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1129  break;
1130 
1131  if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1132  bool tree = false, skinny = false;
1133 
// a METADATA_ITEM is always a tree block; an EXTENT_ITEM is a candidate only when its
// size equals the node size (a further check on ei is on a line missing from this listing)
1134  if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1135  tree = true;
1136  skinny = true;
1137  } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1138  tp.item->size >= sizeof(EXTENT_ITEM)) {
1139  EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1140 
1142  tree = true;
1143  }
1144 
1145  if (tree) {
1146  Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1147 
1148  if (!NT_SUCCESS(Status)) {
1149  ERR("add_metadata_reloc returned %08lx\n", Status);
1150  goto end;
1151  }
1152 
1153  loaded++;
1154 
1155  if (loaded >= 64) // only do 64 at a time
1156  break;
1157  }
1158  }
1159 
1160  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1161 
1162  if (b)
1163  tp = next_tp;
1164  } while (b);
1165 
// nothing queued: report no change and go straight to the common exit
1166  if (IsListEmpty(&items)) {
1167  *changed = false;
1169  goto end;
1170  } else
1171  *changed = true;
1172 
1174  if (!NT_SUCCESS(Status)) {
1175  ERR("write_metadata_items returned %08lx\n", Status);
1176  goto end;
1177  }
1178 
1180 
1181  Vcb->need_write = true;
1182 
1183 end:
// do_write is attempted only when everything so far succeeded
1184  if (NT_SUCCESS(Status)) {
1185  Status = do_write(Vcb, NULL);
1186  if (!NT_SUCCESS(Status))
1187  ERR("do_write returned %08lx\n", Status);
1188  }
1189 
// NOTE(review): both branches of this if/else are on lines missing from this listing
1190  if (NT_SUCCESS(Status))
1192  else
1194 
1195  free_trees(Vcb);
1196 
1197  ExReleaseResourceLite(&Vcb->tree_lock);
1198 
// free every relocation record together with its refs and node buffer
1199  while (!IsListEmpty(&items)) {
1201 
1202  while (!IsListEmpty(&mr->refs)) {
1204 
1205  ExFreePool(ref);
1206  }
1207 
1208  if (mr->data)
1209  ExFreePool(mr->data);
1210 
1211  ExFreePool(mr);
1212  }
1213 
1214  return Status;
1215 }
1216 
// Resolves an EXTENT_DATA_REF for the data extent described by dr into references
// appended to dr->refs, each tied to the tree block that holds the matching EXTENT_DATA
// item. (Presumably data_reloc_add_tree_edr, judging by the call sites; the signature
// and the declaration/allocation of `ref` are missing from this listing.)
1219  NTSTATUS Status;
1220  LIST_ENTRY* le;
1221  KEY searchkey;
1222  traverse_ptr tp;
1223  root* r = NULL;
1224  metadata_reloc* mr;
1225  uint64_t last_tree = 0;
1227 
// find the root named by the reference
1228  le = Vcb->roots.Flink;
1229  while (le != &Vcb->roots) {
1231 
1232  if (r2->id == edr->root) {
1233  r = r2;
1234  break;
1235  }
1236 
1237  le = le->Flink;
1238  }
1239 
1240  if (!r) {
1241  ERR("could not find subvol %I64x\n", edr->root);
1242  return STATUS_INTERNAL_ERROR;
1243  }
1244 
// search from the lowest possible EXTENT_DATA key of the referenced object
1245  searchkey.obj_id = edr->objid;
1246  searchkey.obj_type = TYPE_EXTENT_DATA;
1247  searchkey.offset = 0;
1248 
1249  Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1250  if (!NT_SUCCESS(Status)) {
1251  ERR("find_item returned %08lx\n", Status);
1252  return Status;
1253  }
1254 
// the search may land on the item just before the first EXTENT_DATA; step forward once if so
1255  if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1256  traverse_ptr tp2;
1257 
1258  if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1259  tp = tp2;
1260  else {
1261  ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1262  return STATUS_INTERNAL_ERROR;
1263  }
1264  }
1265 
1266  ref = NULL;
1267 
// visit every EXTENT_DATA item of this object
1268  while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1269  traverse_ptr tp2;
1270 
1271  if (tp.item->size >= sizeof(EXTENT_DATA)) {
1272  EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1273 
1274  if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1275  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1276 
// a match needs the same extent address and size, and the item's file offset minus the
// offset into the extent must equal the offset stored in the reference
1277  if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
// another match in the same tree block as the previous one only bumps that ref's count
1278  if (ref && last_tree == tp.tree->header.address)
1279  ref->edr.count++;
1280  else {
1282  if (!ref) {
1283  ERR("out of memory\n");
1285  }
1286 
1287  ref->type = TYPE_EXTENT_DATA_REF;
1288  RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1289  ref->edr.count = 1;
1290 
// the tree block holding this item becomes the ref's parent record
1291  Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1292  if (!NT_SUCCESS(Status)) {
1293  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1294  ExFreePool(ref);
1295  return Status;
1296  }
1297 
1298  last_tree = tp.tree->header.address;
1299  ref->parent = mr;
1300 
1301  InsertTailList(&dr->refs, &ref->list_entry);
1302  }
1303  }
1304  }
1305  }
1306 
1307  if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1308  tp = tp2;
1309  else
1310  break;
1311  }
1312 
1313  return STATUS_SUCCESS;
1314 }
1315 
/* add_data_reloc: body of the routine that queues one data extent for relocation.
 * (The signature line and several statements are absent from this listing; the name is
 * taken from the caller's call site and error message.)
 *
 * Visible behaviour:
 *  - fills a data_reloc `dr` from the extent item at `tp`: address = key.obj_id,
 *    size = key.offset, ei = the item's EXTENT_ITEM payload;
 *  - removes the extent item from the tree and, when a chunk `c` is known, subtracts the
 *    extent size from c->used;
 *  - walks the inline references that follow the EXTENT_ITEM (one type byte followed by an
 *    EXTENT_DATA_REF or SHARED_DATA_REF) and records each on dr->refs;
 *  - if the inline counts do not reach ei->refcount, also consumes the following keyed
 *    TYPE_EXTENT_DATA_REF / TYPE_SHARED_DATA_REF items with the same obj_id, deleting them.
 *
 * Returns STATUS_SUCCESS, STATUS_INTERNAL_ERROR for a malformed inline section, or the
 * failing status of a helper.
 *
 * NOTE(review): the error returns after the first delete_tree_item still exit without
 * releasing dr or anything already queued on dr->refs -- looks like a leak unless a line
 * missing from this listing hands dr to the caller earlier; confirm against the full source.
 */
1318  NTSTATUS Status;
1319  data_reloc* dr;
1320  EXTENT_ITEM* ei;
1321  uint16_t len;
1322  uint64_t inline_rc;
1323  uint8_t* ptr;
1324 
1326  if (!dr) {
1327  ERR("out of memory\n");
1329  }
1330 
1331  dr->address = tp->item->key.obj_id;
1332  dr->size = tp->item->key.offset;
1333  dr->ei = (EXTENT_ITEM*)tp->item->data;
1334  InitializeListHead(&dr->refs);
1335 
1337  if (!NT_SUCCESS(Status)) {
1338  ERR("delete_tree_item returned %08lx\n", Status);
      // dr is not linked anywhere yet and its ref list is empty, so release it here
      // (same handling as the metadata counterpart, which frees its reloc on this path)
      ExFreePool(dr);
1339  return Status;
1340  }
1341 
1342  if (!c)
1344 
1345  if (c) {
1347 
1348  c->used -= tp->item->key.offset;
1349 
1351 
1353  }
1354 
1355  ei = (EXTENT_ITEM*)tp->item->data;
1356  inline_rc = 0;
1357 
      // inline references start immediately after the fixed-size EXTENT_ITEM
1358  len = tp->item->size - sizeof(EXTENT_ITEM);
1359  ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1360 
1361  while (len > 0) {
1362  uint8_t secttype = *ptr;
1363  uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1364 
      // account for the type byte before checking the payload fits
1365  len--;
1366 
1367  if (sectlen > len) {
1368  ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1369  return STATUS_INTERNAL_ERROR;
1370  }
1371 
1372  if (sectlen == 0) {
1373  ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1374  return STATUS_INTERNAL_ERROR;
1375  }
1376 
1377  if (secttype == TYPE_EXTENT_DATA_REF) {
1378  EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1379 
1380  inline_rc += edr->count;
1381 
1382  Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1383  if (!NT_SUCCESS(Status)) {
1384  ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1385  return Status;
1386  }
1387  } else if (secttype == TYPE_SHARED_DATA_REF) {
1388  metadata_reloc* mr;
1390 
1392  if (!ref) {
1393  ERR("out of memory\n");
1395  }
1396 
1397  ref->type = TYPE_SHARED_DATA_REF;
1398  RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1399  inline_rc += ref->sdr.count;
1400 
      // sdr.offset names the tree block holding the reference; it becomes this ref's parent reloc
1401  Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1402  if (!NT_SUCCESS(Status)) {
1403  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1404  ExFreePool(ref);
1405  return Status;
1406  }
1407 
1408  ref->parent = mr;
1409 
1410  InsertTailList(&dr->refs, &ref->list_entry);
1411  } else {
1412  ERR("unexpected tree type %x\n", secttype);
1413  return STATUS_INTERNAL_ERROR;
1414  }
1415 
1416 
1417  len -= sectlen;
1418  ptr += sizeof(uint8_t) + sectlen;
1419  }
1420 
1421  if (inline_rc < ei->refcount) { // look for non-inline entries
1422  traverse_ptr tp2 = *tp, next_tp;
1423 
1424  while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1425  tp2 = next_tp;
1426 
      // stop at the first item that belongs to a different extent address
1427  if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1428  if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1429  Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1430  if (!NT_SUCCESS(Status)) {
1431  ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1432  return Status;
1433  }
1434 
1435  Status = delete_tree_item(Vcb, &tp2);
1436  if (!NT_SUCCESS(Status)) {
1437  ERR("delete_tree_item returned %08lx\n", Status);
1438  return Status;
1439  }
1440  } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1441  metadata_reloc* mr;
1443 
1445  if (!ref) {
1446  ERR("out of memory\n");
1448  }
1449 
      // keyed form: the parent address is the key offset and the payload is just the count
1450  ref->type = TYPE_SHARED_DATA_REF;
1451  ref->sdr.offset = tp2.item->key.offset;
1452  ref->sdr.count = *((uint32_t*)tp2.item->data);
1453 
1454  Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1455  if (!NT_SUCCESS(Status)) {
1456  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1457  ExFreePool(ref);
1458  return Status;
1459  }
1460 
1461  ref->parent = mr;
1462  InsertTailList(&dr->refs, &ref->list_entry);
1463 
1464  Status = delete_tree_item(Vcb, &tp2);
1465  if (!NT_SUCCESS(Status)) {
1466  ERR("delete_tree_item returned %08lx\n", Status);
1467  return Status;
1468  }
1469  }
1470  } else
1471  break;
1472  }
1473  }
1474 
1476 
1477  return STATUS_SUCCESS;
1478 }
1479 
/* Reorders the reference list dr->refs and merges duplicate extent-data references.
 * (The signature line and the statements that unlink/insert list entries are absent from
 * this listing; `dr` is used as a pointer to a structure with a `refs` list head.)
 *
 * Step 1: every entry gets a sort key in ref->hash and is placed into a temporary list.
 * Step 2: neighbouring TYPE_EXTENT_DATA_REF entries with the same root/objid/offset are
 *         folded into one entry whose count is the sum.
 * Step 3: the temporary list is spliced back in as dr->refs.
 * Nothing is done when dr->refs is empty.
 */
1481  LIST_ENTRY newlist, *le;
1482 
1483  if (IsListEmpty(&dr->refs))
1484  return;
1485 
1486  // insertion sort
1487 
1488  InitializeListHead(&newlist);
1489 
1490  while (!IsListEmpty(&dr->refs)) {
1492  bool inserted = false;
1493 
      // sort key: hash of (root, objid, offset) for extent-data refs,
      // the parent's relocated address for shared refs
1494  if (ref->type == TYPE_EXTENT_DATA_REF)
1495  ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1496  else if (ref->type == TYPE_SHARED_DATA_REF)
1497  ref->hash = ref->parent->new_address;
1498 
1499  le = newlist.Flink;
1500  while (le != &newlist) {
1502 
      // position found at the first existing entry with a larger type, or the same type
      // and a smaller hash (the statement doing the insertion is missing from this listing)
1503  if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1505  inserted = true;
1506  break;
1507  }
1508 
1509  le = le->Flink;
1510  }
1511 
1512  if (!inserted)
1513  InsertTailList(&newlist, &ref->list_entry);
1514  }
1515 
      // merge pass over the sorted list
1516  le = newlist.Flink;
1517  while (le != &newlist) {
1519 
1520  if (le->Flink != &newlist) {
1522 
1523  if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1524  ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1525  RemoveEntryList(&ref2->list_entry);
1526  ref->edr.count += ref2->edr.count;
1527  ExFreePool(ref2);
      // stay on the same entry so it is compared with its new neighbour too
1528  continue;
1529  }
1530  }
1531 
1532  le = le->Flink;
1533  }
1534 
      // hand the nodes of newlist over to dr->refs by re-pointing both ends at the new head
1535  newlist.Flink->Blink = &dr->refs;
1536  newlist.Blink->Flink = &dr->refs;
1537  dr->refs.Flink = newlist.Flink;
1538  dr->refs.Blink = newlist.Blink;
1539 }
1540 
/* add_data_reloc_extent_item: writes the extent-tree entries for a relocated data extent.
 * (The signature line and the allocation statements are absent from this listing; the name
 * is taken from the caller's error message.)
 *
 * Visible behaviour:
 *  - sums the reference counts on dr->refs and decides how many references fit inline:
 *    an inline reference costs one type byte plus its structure, and the item may not grow
 *    beyond a quarter of the node size;
 *  - builds an EXTENT_ITEM (refcount = total, generation/flags copied from dr->ei) followed
 *    by the inline references and inserts it at (dr->new_address, TYPE_EXTENT_ITEM, dr->size);
 *  - inserts every remaining reference as its own item: TYPE_EXTENT_DATA_REF keyed by
 *    ref->hash, TYPE_SHARED_DATA_REF keyed by the parent's new address.
 *
 * Returns STATUS_SUCCESS or the failing status of insert_tree_item.
 *
 * Changes relative to the listing:
 *  - a shared data reference contributes its `count` to the refcount (previously 1); the
 *    reader of these items adds sdr.count when it compares against ei->refcount, and the
 *    count itself is written back unchanged, so the total has to include it;
 *  - buffers passed to insert_tree_item are freed when the insert fails, matching the way
 *    the balance-item writer in this file handles the same failure.
 */
1542  NTSTATUS Status;
1543  LIST_ENTRY* le;
1544  uint64_t rc = 0;
1545  uint16_t inline_len;
1546  bool all_inline = true;
1547  data_reloc_ref* first_noninline = NULL;
1548  EXTENT_ITEM* ei;
1549  uint8_t* ptr;
1550 
1551  inline_len = sizeof(EXTENT_ITEM);
1552 
1554 
      // pass 1: total refcount, inline size, and the first reference that no longer fits inline
1555  le = dr->refs.Flink;
1556  while (le != &dr->refs) {
1558  uint16_t extlen = 0;
1559 
1560  if (ref->type == TYPE_EXTENT_DATA_REF) {
1561  extlen += sizeof(EXTENT_DATA_REF);
1562  rc += ref->edr.count;
1563  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1564  extlen += sizeof(SHARED_DATA_REF);
1565  rc += ref->sdr.count;
1566  }
1567 
1568  if (all_inline) {
1569  if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1570  all_inline = false;
1571  first_noninline = ref;
1572  } else
1573  inline_len += extlen + 1;
1574  }
1575 
1576  le = le->Flink;
1577  }
1578 
1579  ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1580  if (!ei) {
1581  ERR("out of memory\n");
1583  }
1584 
1585  ei->refcount = rc;
1586  ei->generation = dr->ei->generation;
1587  ei->flags = dr->ei->flags;
1588  ptr = (uint8_t*)&ei[1];
1589 
      // pass 2: serialise the inline references directly behind the EXTENT_ITEM
1590  le = dr->refs.Flink;
1591  while (le != &dr->refs) {
1593 
1594  if (ref == first_noninline)
1595  break;
1596 
1597  *ptr = ref->type;
1598  ptr++;
1599 
1600  if (ref->type == TYPE_EXTENT_DATA_REF) {
1602 
1603  RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1604 
1605  ptr += sizeof(EXTENT_DATA_REF);
1606  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1608 
      // the shared ref must point at the parent's relocated address
1609  sdr->offset = ref->parent->new_address;
1610  sdr->count = ref->sdr.count;
1611 
1612  ptr += sizeof(SHARED_DATA_REF);
1613  }
1614 
1615  le = le->Flink;
1616  }
1617 
1618  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1619  if (!NT_SUCCESS(Status)) {
1620  ERR("insert_tree_item returned %08lx\n", Status);
      ExFreePool(ei); // not taken over by the tree on failure
1621  return Status;
1622  }
1623 
1624  if (!all_inline) {
1625  le = &first_noninline->list_entry;
1626 
      // pass 3: everything from first_noninline onwards becomes a separate keyed item
1627  while (le != &dr->refs) {
1629 
1630  if (ref->type == TYPE_EXTENT_DATA_REF) {
1631  EXTENT_DATA_REF* edr;
1632 
1634  if (!edr) {
1635  ERR("out of memory\n");
1637  }
1638 
1639  RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1640 
1641  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1642  if (!NT_SUCCESS(Status)) {
1643  ERR("insert_tree_item returned %08lx\n", Status);
      ExFreePool(edr); // not taken over by the tree on failure
1644  return Status;
1645  }
1646  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1647  uint32_t* sdr;
1648 
1650  if (!sdr) {
1651  ERR("out of memory\n");
1653  }
1654 
1655  *sdr = ref->sdr.count;
1656 
1657  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1658  if (!NT_SUCCESS(Status)) {
1659  ERR("insert_tree_item returned %08lx\n", Status);
      ExFreePool(sdr); // not taken over by the tree on failure
1660  return Status;
1661  }
1662  }
1663 
1664  le = le->Flink;
1665  }
1666  }
1667 
1668  return STATUS_SUCCESS;
1669 }
1670 
/* Relocates a batch of data extents out of chunk `c`.
 * (The signature line and a number of statements are absent from this listing.)
 *
 * Phases visible in this body, all under the exclusive tree_lock:
 *  1. scan the extent tree from c->offset and queue every non-tree EXTENT_ITEM inside the
 *     chunk via add_data_reloc, stopping after 0x1000000 bytes or 100 extents;
 *  2. report through *changed whether anything was queued;
 *  3. for each queued extent choose a destination (the chunk used last, another suitable
 *     chunk, or a freshly allocated one), then copy the data in pieces of at most
 *     BALANCE_UNIT bytes, carrying existing checksums across to the new address;
 *  4. write the affected metadata items and the new extent items, and re-point pending
 *     changed_extents entries;
 *  5. at `end`: on success update cached and open-file extents and flush with do_write;
 *     afterwards free the trees, the lock, the copy buffer and both reloc lists.
 *
 * Returns the last NTSTATUS obtained.
 */
1672  KEY searchkey;
1673  traverse_ptr tp;
1674  NTSTATUS Status;
1675  bool b;
1676  LIST_ENTRY items, metadata_items, rollback, *le;
1677  uint64_t loaded = 0, num_loaded = 0;
1678  chunk* newchunk = NULL;
1679  uint8_t* data = NULL;
1680 
1681  TRACE("chunk %I64x\n", c->offset);
1682 
1685  InitializeListHead(&metadata_items);
1686 
1687  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1688 
1689  searchkey.obj_id = c->offset;
1690  searchkey.obj_type = TYPE_EXTENT_ITEM;
1691  searchkey.offset = 0xffffffffffffffff;
1692 
1693  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1694  if (!NT_SUCCESS(Status)) {
1695  ERR("find_item returned %08lx\n", Status);
1696  goto end;
1697  }
1698 
      // phase 1: collect data extents that lie inside the chunk
1699  do {
1700  traverse_ptr next_tp;
1701 
1702  if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1703  break;
1704 
1705  if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1706  bool tree = false;
1707 
1708  if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1709  EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1710 
1712  tree = true;
1713  }
1714 
1715  if (!tree) {
1716  Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1717 
1718  if (!NT_SUCCESS(Status)) {
1719  ERR("add_data_reloc returned %08lx\n", Status);
1720  goto end;
1721  }
1722 
1723  loaded += tp.item->key.offset;
1724  num_loaded++;
1725 
1726  if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1727  break;
1728  }
1729  }
1730 
1731  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1732 
1733  if (b)
1734  tp = next_tp;
1735  } while (b);
1736 
1737  if (IsListEmpty(&items)) {
1738  *changed = false;
1740  goto end;
1741  } else
1742  *changed = true;
1743 
1745  if (!data) {
1746  ERR("out of memory\n");
1748  goto end;
1749  }
1750 
      // phase 3: pick a destination for each queued extent and copy it
1751  le = items.Flink;
1752  while (le != &items) {
1754  bool done = false;
1755  LIST_ENTRY* le2;
1756  void* csum;
1757  RTL_BITMAP bmp;
1758  ULONG* bmparr;
1759  ULONG bmplen, runlength, index, lastoff;
1760 
      // first choice: the chunk the previous extent went to
1761  if (newchunk) {
1762  acquire_chunk_lock(newchunk, Vcb);
1763 
1764  if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1765  newchunk->used += dr->size;
1766  space_list_subtract(newchunk, dr->new_address, dr->size, &rollback);
1767  done = true;
1768  }
1769 
1770  release_chunk_lock(newchunk, Vcb);
1771  }
1772 
1773  if (!done) {
1774  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1775 
      // second choice: any writable chunk of the data profile that is not itself being relocated
1776  le2 = Vcb->chunks.Flink;
1777  while (le2 != &Vcb->chunks) {
1778  chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1779 
1780  if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1781  acquire_chunk_lock(c2, Vcb);
1782 
1783  if ((c2->chunk_item->size - c2->used) >= dr->size) {
1784  if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1785  c2->used += dr->size;
1786  space_list_subtract(c2, dr->new_address, dr->size, &rollback);
1787  release_chunk_lock(c2, Vcb);
1788  newchunk = c2;
1789  done = true;
1790  break;
1791  }
1792  }
1793 
1794  release_chunk_lock(c2, Vcb);
1795  }
1796 
1797  le2 = le2->Flink;
1798  }
1799 
1800  // allocate new chunk if necessary
1801  if (!done) {
1802  Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1803 
1804  if (!NT_SUCCESS(Status)) {
1805  ERR("alloc_chunk returned %08lx\n", Status);
1806  ExReleaseResourceLite(&Vcb->chunk_lock);
1807  goto end;
1808  }
1809 
1810  acquire_chunk_lock(newchunk, Vcb);
1811 
1812  newchunk->balance_num = Vcb->balance.balance_num;
1813 
1814  if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1815  release_chunk_lock(newchunk, Vcb);
1816  ExReleaseResourceLite(&Vcb->chunk_lock);
1817  ERR("could not find address in new chunk\n");
1819  goto end;
1820  } else {
1821  newchunk->used += dr->size;
1822  space_list_subtract(newchunk, dr->new_address, dr->size, &rollback);
1823  }
1824 
1825  release_chunk_lock(newchunk, Vcb);
1826  }
1827 
1828  ExReleaseResourceLite(&Vcb->chunk_lock);
1829  }
1830 
1831  dr->newchunk = newchunk;
1832 
      // one bitmap bit per sector of the extent
1833  bmplen = (ULONG)(dr->size >> Vcb->sector_shift);
1834 
1835  bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1836  if (!bmparr) {
1837  ERR("out of memory\n");
1839  goto end;
1840  }
1841 
1842  csum = ExAllocatePoolWithTag(PagedPool, (ULONG)((dr->size * Vcb->csum_size) >> Vcb->sector_shift), ALLOC_TAG);
1843  if (!csum) {
1844  ERR("out of memory\n");
1845  ExFreePool(bmparr);
1847  goto end;
1848  }
1849 
1850  RtlInitializeBitMap(&bmp, bmparr, bmplen);
1851  RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1852 
1853  searchkey.obj_id = EXTENT_CSUM_ID;
1854  searchkey.obj_type = TYPE_EXTENT_CSUM;
1855  searchkey.offset = dr->address;
1856 
1857  Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1858  if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1859  ERR("find_item returned %08lx\n", Status);
1860  ExFreePool(csum);
1861  ExFreePool(bmparr);
1862  goto end;
1863  }
1864 
      // copy every stored checksum overlapping [dr->address, dr->address + dr->size) into csum
      // and clear the matching bitmap bits
1865  if (Status != STATUS_NOT_FOUND) {
1866  do {
1867  traverse_ptr next_tp;
1868 
1869  if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1870  if (tp.item->key.offset >= dr->address + dr->size)
1871  break;
1872  else if (tp.item->size >= Vcb->csum_size && tp.item->key.offset + (((unsigned int)tp.item->size << Vcb->sector_shift) / Vcb->csum_size) >= dr->address) {
1873  uint64_t cs = max(dr->address, tp.item->key.offset);
1874  uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (((unsigned int)tp.item->size << Vcb->sector_shift) / Vcb->csum_size));
1875 
1876  RtlCopyMemory((uint8_t*)csum + (((cs - dr->address) * Vcb->csum_size) >> Vcb->sector_shift),
1877  tp.item->data + (((cs - tp.item->key.offset) * Vcb->csum_size) >> Vcb->sector_shift),
1878  (ULONG)(((ce - cs) * Vcb->csum_size) >> Vcb->sector_shift));
1879 
1880  RtlClearBits(&bmp, (ULONG)((cs - dr->address) >> Vcb->sector_shift), (ULONG)((ce - cs) >> Vcb->sector_shift));
1881 
1882  if (ce == dr->address + dr->size)
1883  break;
1884  }
1885  }
1886 
1887  if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1888  tp = next_tp;
1889  else
1890  break;
1891  } while (true);
1892  }
1893 
      // walk the runs of checksummed sectors; sectors between lastoff and the next run have no checksum
1894  lastoff = 0;
1895  runlength = RtlFindFirstRunClear(&bmp, &index);
1896 
1897  while (runlength != 0) {
1898  if (index >= bmplen)
1899  break;
1900 
      // clamp a run that reaches past the end of the bitmap
1901  if (index + runlength >= bmplen) {
1902  runlength = bmplen - index;
1903 
1904  if (runlength == 0)
1905  break;
1906  }
1907 
1908  if (index > lastoff) {
1909  ULONG off = lastoff;
1910  ULONG size = index - lastoff;
1911 
1912  // handle no csum run
1913  do {
1914  ULONG rl;
1915 
1916  if (size << Vcb->sector_shift > BALANCE_UNIT)
1917  rl = BALANCE_UNIT >> Vcb->sector_shift;
1918  else
1919  rl = size;
1920 
1921  Status = read_data(Vcb, dr->address + (off << Vcb->sector_shift), rl << Vcb->sector_shift, NULL, false, data,
1922  c, NULL, NULL, 0, false, NormalPagePriority);
1923  if (!NT_SUCCESS(Status)) {
1924  ERR("read_data returned %08lx\n", Status);
1925  ExFreePool(csum);
1926  ExFreePool(bmparr);
1927  goto end;
1928  }
1929 
1930  Status = write_data_complete(Vcb, dr->new_address + (off << Vcb->sector_shift), data, rl << Vcb->sector_shift,
1931  NULL, newchunk, false, 0, NormalPagePriority);
1932  if (!NT_SUCCESS(Status)) {
1933  ERR("write_data_complete returned %08lx\n", Status);
1934  ExFreePool(csum);
1935  ExFreePool(bmparr);
1936  goto end;
1937  }
1938 
1939  size -= rl;
1940  off += rl;
1941  } while (size > 0);
1942  }
1943 
      // register the checksums at the new address; the call for the old address passes NULL checksum data
1944  add_checksum_entry(Vcb, dr->new_address + (index << Vcb->sector_shift), runlength, (uint8_t*)csum + (index * Vcb->csum_size), NULL);
1945  add_checksum_entry(Vcb, dr->address + (index << Vcb->sector_shift), runlength, NULL, NULL);
1946 
1947  // handle csum run
1948  do {
1949  ULONG rl;
1950 
1951  if (runlength << Vcb->sector_shift > BALANCE_UNIT)
1952  rl = BALANCE_UNIT >> Vcb->sector_shift;
1953  else
1954  rl = runlength;
1955 
1956  Status = read_data(Vcb, dr->address + (index << Vcb->sector_shift), rl << Vcb->sector_shift,
1957  (uint8_t*)csum + (index * Vcb->csum_size), false, data, c, NULL, NULL, 0, false, NormalPagePriority);
1958  if (!NT_SUCCESS(Status)) {
1959  ERR("read_data returned %08lx\n", Status);
1960  ExFreePool(csum);
1961  ExFreePool(bmparr);
1962  goto end;
1963  }
1964 
1965  Status = write_data_complete(Vcb, dr->new_address + (index << Vcb->sector_shift), data, rl << Vcb->sector_shift,
1966  NULL, newchunk, false, 0, NormalPagePriority);
1967  if (!NT_SUCCESS(Status)) {
1968  ERR("write_data_complete returned %08lx\n", Status);
1969  ExFreePool(csum);
1970  ExFreePool(bmparr);
1971  goto end;
1972  }
1973 
1974  runlength -= rl;
1975  index += rl;
1976  } while (runlength > 0);
1977 
1978  lastoff = index;
1979  runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1980  }
1981 
1982  ExFreePool(csum);
1983  ExFreePool(bmparr);
1984 
1985  // handle final nocsum run
1986  if (lastoff < dr->size >> Vcb->sector_shift) {
1987  ULONG off = lastoff;
1988  ULONG size = (ULONG)((dr->size >> Vcb->sector_shift) - lastoff);
1989 
1990  do {
1991  ULONG rl;
1992 
1993  if (size << Vcb->sector_shift > BALANCE_UNIT)
1994  rl = BALANCE_UNIT >> Vcb->sector_shift;
1995  else
1996  rl = size;
1997 
1998  Status = read_data(Vcb, dr->address + (off << Vcb->sector_shift), rl << Vcb->sector_shift, NULL, false, data,
1999  c, NULL, NULL, 0, false, NormalPagePriority);
2000  if (!NT_SUCCESS(Status)) {
2001  ERR("read_data returned %08lx\n", Status);
2002  goto end;
2003  }
2004 
2005  Status = write_data_complete(Vcb, dr->new_address + (off << Vcb->sector_shift), data, rl << Vcb->sector_shift,
2006  NULL, newchunk, false, 0, NormalPagePriority);
2007  if (!NT_SUCCESS(Status)) {
2008  ERR("write_data_complete returned %08lx\n", Status);
2009  goto end;
2010  }
2011 
2012  size -= rl;
2013  off += rl;
2014  } while (size > 0);
2015  }
2016 
2017  le = le->Flink;
2018  }
2019 
2020  ExFreePool(data);
2021  data = NULL;
2022 
      // phase 4: write out the tree blocks that reference the moved extents, then the new extent items
2023  Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2024  if (!NT_SUCCESS(Status)) {
2025  ERR("write_metadata_items returned %08lx\n", Status);
2026  goto end;
2027  }
2028 
2029  le = items.Flink;
2030  while (le != &items) {
2032 
2034  if (!NT_SUCCESS(Status)) {
2035  ERR("add_data_reloc_extent_item returned %08lx\n", Status);
2036  goto end;
2037  }
2038 
2039  le = le->Flink;
2040  }
2041 
      // re-point pending changed_extents entries of the old chunk at the new addresses
      // (le3 is saved first; two statements inside the match branch are absent from this listing)
2042  le = c->changed_extents.Flink;
2043  while (le != &c->changed_extents) {
2044  LIST_ENTRY *le2, *le3;
2046 
2047  le3 = le->Flink;
2048 
2049  le2 = items.Flink;
2050  while (le2 != &items) {
2052 
2053  if (ce->address == dr->address) {
2054  ce->address = dr->new_address;
2057  break;
2058  }
2059 
2060  le2 = le2->Flink;
2061  }
2062 
2063  le = le3;
2064  }
2065 
2067 
2068  Vcb->need_write = true;
2069 
2070 end:
2071  if (NT_SUCCESS(Status)) {
2072  // update extents in cache inodes before we flush
2073  le = Vcb->chunks.Flink;
2074  while (le != &Vcb->chunks) {
2076 
2077  if (c2->cache) {
2078  LIST_ENTRY* le2;
2079 
2080  ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2081 
2082  le2 = c2->cache->extents.Flink;
2083  while (le2 != &c2->cache->extents) {
2085 
2086  if (!ext->ignore) {
2087  if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2088  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2089 
2090  if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2091  LIST_ENTRY* le3 = items.Flink;
2092  while (le3 != &items) {
2094 
2095  if (ed2->address == dr->address) {
2096  ed2->address = dr->new_address;
2097  break;
2098  }
2099 
2100  le3 = le3->Flink;
2101  }
2102  }
2103  }
2104  }
2105 
2106  le2 = le2->Flink;
2107  }
2108 
2109  ExReleaseResourceLite(c2->cache->Header.Resource);
2110  }
2111 
2112  le = le->Flink;
2113  }
2114 
2115  Status = do_write(Vcb, NULL);
2116  if (!NT_SUCCESS(Status))
2117  ERR("do_write returned %08lx\n", Status);
2118  }
2119 
      // only after a successful flush are the in-memory extents of open files switched over
2120  if (NT_SUCCESS(Status)) {
2122 
2123  // update open FCBs
2124  // FIXME - speed this up(?)
2125 
2126  le = Vcb->all_fcbs.Flink;
2127  while (le != &Vcb->all_fcbs) {
2128  struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2129  LIST_ENTRY* le2;
2130 
2131  ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2132 
2133  le2 = fcb->extents.Flink;
2134  while (le2 != &fcb->extents) {
2136 
2137  if (!ext->ignore) {
2138  if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2139  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2140 
2141  if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2142  LIST_ENTRY* le3 = items.Flink;
2143  while (le3 != &items) {
2145 
2146  if (ed2->address == dr->address) {
2147  ed2->address = dr->new_address;
2148  break;
2149  }
2150 
2151  le3 = le3->Flink;
2152  }
2153  }
2154  }
2155  }
2156 
2157  le2 = le2->Flink;
2158  }
2159 
2160  ExReleaseResourceLite(fcb->Header.Resource);
2161 
2162  le = le->Flink;
2163  }
2164  } else
2166 
2167  free_trees(Vcb);
2168 
2169  ExReleaseResourceLite(&Vcb->tree_lock);
2170 
2171  if (data)
2172  ExFreePool(data);
2173 
      // release both reloc lists together with the references hanging off each entry
2174  while (!IsListEmpty(&items)) {
2176 
2177  while (!IsListEmpty(&dr->refs)) {
2179 
2180  ExFreePool(ref);
2181  }
2182 
2183  ExFreePool(dr);
2184  }
2185 
2186  while (!IsListEmpty(&metadata_items)) {
2188 
2189  while (!IsListEmpty(&mr->refs)) {
2191 
2192  ExFreePool(ref);
2193  }
2194 
2195  if (mr->data)
2196  ExFreePool(mr->data);
2197 
2198  ExFreePool(mr);
2199  }
2200 
2201  return Status;
2202 }
2203 
// Returns the profile flag of chunk c: the first of RAID0, RAID1, DUPLICATE, RAID10, RAID5,
// RAID6, RAID1C3 and RAID1C4 (tested in exactly this order) that is set in
// c->chunk_item->type, or BLOCK_FLAG_SINGLE when none of them is set.
2204 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2205  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2206  return BLOCK_FLAG_RAID0;
2207  if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2208  return BLOCK_FLAG_RAID1;
2209  if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2210  return BLOCK_FLAG_DUPLICATE;
2211  if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2212  return BLOCK_FLAG_RAID10;
2213  if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2214  return BLOCK_FLAG_RAID5;
2215  if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2216  return BLOCK_FLAG_RAID6;
2217  if (c->chunk_item->type & BLOCK_FLAG_RAID1C3)
2218  return BLOCK_FLAG_RAID1C3;
2219  if (c->chunk_item->type & BLOCK_FLAG_RAID1C4)
2220  return BLOCK_FLAG_RAID1C4;
2221  // every earlier test returned, so no profile bit is set here
2222  return BLOCK_FLAG_SINGLE;
2223 }
2224 
/* should_balance_chunk: decides whether chunk `c` passes the balance filters in
 * Vcb->balance.opts[sort].
 * (The signature line and a few statements are absent from this listing.)
 *
 * Returns false when the option set is not enabled or when any enabled filter rejects the
 * chunk; returns true otherwise. Filters visible here, in evaluation order:
 *   PROFILES - the chunk's profile flag must intersect opts->profiles
 *   DEVID    - at least one stripe must live on opts->devid
 *   DRANGE   - at least one stripe must overlap the physical range drange_start..drange_end
 *              (restricted to opts->devid when DEVID is set as well)
 *   VRANGE   - the chunk must overlap the logical range vrange_start..vrange_end
 *   STRIPES  - num_stripes must lie within stripes_start..stripes_end
 *   USAGE    - percentage used must lie within usage_start..usage_end
 *   a final test rejects chunks whose profile already equals opts->convert (its guarding
 *   condition is on a line missing from this listing).
 *
 * Change relative to the listing: the STRIPES upper bound was tested with `<`, which rejected
 * every chunk with fewer stripes than stripes_end and accepted any number above it; it now
 * uses `>` like the USAGE range check below it.
 */
2226  btrfs_balance_opts* opts;
2227 
2228  opts = &Vcb->balance.opts[sort];
2229 
2230  if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2231  return false;
2232 
2233  if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2235 
2236  if (!(type & opts->profiles))
2237  return false;
2238  }
2239 
2240  if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2241  uint16_t i;
      // the stripe array starts directly behind the CHUNK_ITEM header
2242  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2243  bool b = false;
2244 
2245  for (i = 0; i < c->chunk_item->num_stripes; i++) {
2246  if (cis[i].dev_id == opts->devid) {
2247  b = true;
2248  break;
2249  }
2250  }
2251 
2252  if (!b)
2253  return false;
2254  }
2255 
2256  if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2257  uint16_t i, factor;
2258  uint64_t physsize;
2259  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2260  bool b = false;
2261 
      // factor = number of stripes carrying distinct data; the logical size divided by it
      // is the length each stripe occupies on its device
2262  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2263  factor = c->chunk_item->num_stripes;
2264  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2265  factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2266  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2267  factor = c->chunk_item->num_stripes - 1;
2268  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2269  factor = c->chunk_item->num_stripes - 2;
2270  else // SINGLE, DUPLICATE, RAID1, RAID1C3, RAID1C4
2271  factor = 1;
2272 
      // NOTE(review): factor is not checked for 0 before the division (e.g. a malformed chunk
      // item with too few stripes for its profile) -- confirm the values are validated on load
2273  physsize = c->chunk_item->size / factor;
2274 
2275  for (i = 0; i < c->chunk_item->num_stripes; i++) {
2276  if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2277  (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2278  b = true;
2279  break;
2280  }
2281  }
2282 
2283  if (!b)
2284  return false;
2285  }
2286 
2287  if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2288  if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2289  return false;
2290  }
2291 
2292  if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
      // reject stripe counts outside [stripes_start, stripes_end]
2293  if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes > opts->stripes_end)
2294  return false;
2295  }
2296 
2297  if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2298  uint64_t usage = c->used * 100 / c->chunk_item->size;
2299 
2300  // usage == 0 should mean completely empty, not just that usage rounds to 0%
2301  if (c->used > 0 && usage == 0)
2302  usage = 1;
2303 
2304  if (usage < opts->usage_start || usage > opts->usage_end)
2305  return false;
2306  }
2307 
2310 
2311  if (type == opts->convert)
2312  return false;
2313  }
2314 
2315  return true;
2316 }
2317 
/* Copies the filters held in `opts` into `args`, setting the matching BALANCE_ARGS_FLAGS_*
 * bit in args->flags for each BTRFS_BALANCE_OPTS_* bit found in opts->flags.
 * (The signature line and several flag-setting statements are absent from this listing;
 * the visible caller passes an option set of Vcb->balance and a member of a zero-filled
 * BALANCE_ITEM.)
 */
2319  if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2320  args->profiles = opts->profiles;
2322  }
2323 
2324  if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
      // NOTE(review): this tests the destination field (args->usage_start), not the source;
      // with a zero-filled destination the first branch is always taken -- confirm the intent
2325  if (args->usage_start == 0) {
2327  args->usage_start = opts->usage_start;
2328  args->usage_end = opts->usage_end;
2329  } else {
2330  args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2331  args->usage = opts->usage_end;
2332  }
2333  }
2334 
2335  if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2336  args->devid = opts->devid;
2337  args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2338  }
2339 
2340  if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2341  args->drange_start = opts->drange_start;
2342  args->drange_end = opts->drange_end;
2343  args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2344  }
2345 
2346  if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2347  args->vrange_start = opts->vrange_start;
2348  args->vrange_end = opts->vrange_end;
2349  args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2350  }
2351 
      // the SOFT flag is only carried over together with CONVERT
2352  if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2353  args->convert = opts->convert;
2354  args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2355 
2356  if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2357  args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2358  }
2359 
2360  if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
      // NOTE(review): same pattern as the usage filter above -- the destination field is tested
2361  if (args->limit_start == 0) {
2363  args->limit_start = (uint32_t)opts->limit_start;
2364  args->limit_end = (uint32_t)opts->limit_end;
2365  } else {
2366  args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2367  args->limit = opts->limit_end;
2368  }
2369  }
2370 
2371  if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2372  args->stripes_start = opts->stripes_start;
2373  args->stripes_end = opts->stripes_end;
2375  }
2376 }
2377 
/* Stores the current balance options as an item in the root tree under the key
 * (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0) and flushes it.
 * (The signature line, the allocation of `bi` and a few other statements are absent from
 * this listing.)
 *
 * Runs under the exclusive tree_lock. An item already present under that key is handled
 * first (the error text names delete_tree_item). On success the change is written with
 * do_write; free_trees and the lock release happen on every path. Returns the last NTSTATUS.
 */
2379  KEY searchkey;
2380  traverse_ptr tp;
2381  NTSTATUS Status;
2382  BALANCE_ITEM* bi;
2383 
2384  searchkey.obj_id = BALANCE_ITEM_ID;
2385  searchkey.obj_type = TYPE_TEMP_ITEM;
2386  searchkey.offset = 0;
2387 
2388  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2389 
2390  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2391  if (!NT_SUCCESS(Status)) {
2392  ERR("find_item returned %08lx\n", Status);
2393  goto end;
2394  }
2395 
      // keycmp yields 0 for equal keys, i.e. this branch runs when the item already exists
2396  if (!keycmp(tp.item->key, searchkey)) {
2398  if (!NT_SUCCESS(Status)) {
2399  ERR("delete_tree_item returned %08lx\n", Status);
2400  goto end;
2401  }
2402  }
2403 
2405  if (!bi) {
2406  ERR("out of memory\n");
2408  goto end;
2409  }
2410 
2411  RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2412 
      // one argument block per enabled option set (data, metadata, system)
2413  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2414  bi->flags |= BALANCE_FLAGS_DATA;
2415  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2416  }
2417 
2418  if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2420  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2421  }
2422 
2423  if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2424  bi->flags |= BALANCE_FLAGS_SYSTEM;
2425  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2426  }
2427 
2428  Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2429  if (!NT_SUCCESS(Status)) {
2430  ERR("insert_tree_item returned %08lx\n", Status);
      // the buffer is released by the caller when the insert fails
2431  ExFreePool(bi);
2432  goto end;
2433  }
2434 
2436 
2437 end:
2438  if (NT_SUCCESS(Status)) {
2439  Status = do_write(Vcb, NULL);
2440  if (!NT_SUCCESS(Status))
2441  ERR("do_write returned %08lx\n", Status);
2442  }
2443 
2444  free_trees(Vcb);
2445 
2446  ExReleaseResourceLite(&Vcb->tree_lock);
2447 
2448  return Status;
2449 }
2450 
/* Looks up the root-tree item with key (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0) and, when it
 * exists, removes it and flushes the change with do_write.
 * (The signature line and two statements are absent from this listing.)
 *
 * Runs under the exclusive tree_lock, which is released on every path. Returns the last
 * NTSTATUS obtained.
 */
2452  KEY searchkey;
2453  traverse_ptr tp;
2454  NTSTATUS Status;
2455 
2456  searchkey.obj_id = BALANCE_ITEM_ID;
2457  searchkey.obj_type = TYPE_TEMP_ITEM;
2458  searchkey.offset = 0;
2459 
2460  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2461 
2462  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2463  if (!NT_SUCCESS(Status)) {
2464  ERR("find_item returned %08lx\n", Status);
2465  goto end;
2466  }
2467 
      // keycmp yields 0 for equal keys: only act when the item is actually present
2468  if (!keycmp(tp.item->key, searchkey)) {
2470  if (!NT_SUCCESS(Status)) {
2471  ERR("delete_tree_item returned %08lx\n", Status);
2472  goto end;
2473  }
2474 
2475  Status = do_write(Vcb, NULL);
2476  if (!NT_SUCCESS(Status)) {
2477  ERR("do_write returned %08lx\n", Status);
      // NOTE(review): this path (and the one above) skips free_trees, unlike the other
      // routines in this file that call it regardless of status -- confirm that is intended
2478  goto end;
2479  }
2480 
2481  free_trees(Vcb);
2482  }
2483 
2485 
2486 end:
2487  ExReleaseResourceLite(&Vcb->tree_lock);
2488 
2489  return Status;
2490 }
2491 
/* Fills `opts` from `args`: for each BALANCE_ARGS_FLAGS_* bit set in args->flags the
 * corresponding fields are copied across.
 * (The signature line and the statements that set most bits in opts->flags are absent from
 * this listing; only the SOFT flag assignment is visible.)
 *
 * The single-value forms are turned into ranges starting at 0: USAGE becomes
 * usage_start = 0 / usage_end = usage, LIMIT becomes limit_start = 0 / limit_end = limit.
 * Usage values are narrowed to uint8_t and stripe counts to uint16_t by the casts below.
 */
2494 
2495  if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2497  opts->profiles = args->profiles;
2498  }
2499 
      // the single-value form takes precedence over the range form when both bits are set
2500  if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2502 
2503  opts->usage_start = 0;
2504  opts->usage_end = (uint8_t)args->usage;
2505  } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2507 
2508  opts->usage_start = (uint8_t)args->usage_start;
2509  opts->usage_end = (uint8_t)args->usage_end;
2510  }
2511 
2512  if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2514  opts->devid = args->devid;
2515  }
2516 
2517  if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2519  opts->drange_start = args->drange_start;
2520  opts->drange_end = args->drange_end;
2521  }
2522 
2523  if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2525  opts->vrange_start = args->vrange_start;
2526  opts->vrange_end = args->vrange_end;
2527  }
2528 
      // same precedence as for usage: LIMIT wins over LIMIT_RANGE
2529  if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2531 
2532  opts->limit_start = 0;
2533  opts->limit_end = args->limit;
2534  } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2536 
2537  opts->limit_start = args->limit_start;
2538  opts->limit_end = args->limit_end;
2539  }
2540 
2541  if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2543 
2544  opts->stripes_start = (uint16_t)args->stripes_start;
2545  opts->stripes_end = (uint16_t)args->stripes_end;
2546  }
2547 
      // SOFT is only honoured in combination with CONVERT
2548  if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2550  opts->convert = args->convert;
2551 
2552  if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2553  opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2554  }
2555 }
2556 
/* Overwrites the superblock copies on device `dev` with zeroes.
 * (The signature line and the allocation of `sb` are absent from this listing; the name is
 * taken from the caller's warning message.)
 *
 * A zero-filled buffer of sizeof(superblock) bytes is written to each address in
 * superblock_addrs, stopping at the first entry that is 0 or that would not fit entirely
 * within dev->devitem.num_bytes. Returns the status of the first failed write, otherwise
 * STATUS_SUCCESS; the buffer is freed on both paths shown.
 */
2558  NTSTATUS Status;
2559  superblock* sb;
2560  int i = 0;
2561 
2563  if (!sb) {
2564  ERR("out of memory\n");
2566  }
2567 
2568  RtlZeroMemory(sb, sizeof(superblock));
2569 
2570  while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2571  Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2572 
2573  if (!NT_SUCCESS(Status)) {
2574  ExFreePool(sb);
2575  return Status;
2576  }
2577 
2578  i++;
2579  }
2580 
2581  ExFreePool(sb);
2582 
2583  return STATUS_SUCCESS;
2584 }
2585 
// Final stage of removing device `dev` from the volume `Vcb`.
// Steps visible here, in order:
//   1. flush pending changes (do_write) and drop cached trees;
//   2. look up the device's DEV_ITEM in the chunk tree and its DEV_STATS item
//      in the device tree (the ERR strings indicate both are deleted via
//      delete_tree_item; those calls are on lines missing from this listing);
//   3. decrement the superblock device count / total_bytes, unlink dev from
//      the device list and flush again;
//   4. for a writable device with a device object, wipe its superblocks
//      (failure is only a warning);
//   5. drop the matching child from the PDO's children list, re-adding its
//      mount manager entry if it previously had a drive letter;
//   6. free dev's space list and dev itself, then recompute Vcb->trim.
// Returns the failing status of the flush / tree steps, else STATUS_SUCCESS.
// NOTE(review): several listing lines are absent from this copy (signature,
// some declarations and calls) - confirm details against the real source.
2587  KEY searchkey;
2588  traverse_ptr tp;
2589  NTSTATUS Status;
2590  LIST_ENTRY* le;
2592 
2593  if (Vcb->need_write) {
2594  Status = do_write(Vcb, NULL);
2595 
2596  if (!NT_SUCCESS(Status))
2597  ERR("do_write returned %08lx\n", Status);
2598  } else
2600 
      // trees are released even when do_write failed; the status is checked afterwards
2601  free_trees(Vcb);
2602 
2603  if (!NT_SUCCESS(Status))
2604  return Status;
2605 
2606  // remove entry in chunk tree
2607 
2608  searchkey.obj_id = 1;
2609  searchkey.obj_type = TYPE_DEV_ITEM;
2610  searchkey.offset = dev->devitem.dev_id;
2611 
2612  Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2613  if (!NT_SUCCESS(Status)) {
2614  ERR("find_item returned %08lx\n", Status);
2615  return Status;
2616  }
2617 
      // only act when the exact key was found; a missing item is not an error
2618  if (!keycmp(searchkey, tp.item->key)) {
2620 
2621  if (!NT_SUCCESS(Status)) {
2622  ERR("delete_tree_item returned %08lx\n", Status);
2623  return Status;
2624  }
2625  }
2626 
2627  // remove stats entry in device tree
2628 
2629  searchkey.obj_id = 0;
2630  searchkey.obj_type = TYPE_DEV_STATS;
2631  searchkey.offset = dev->devitem.dev_id;
2632 
2633  Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2634  if (!NT_SUCCESS(Status)) {
2635  ERR("find_item returned %08lx\n", Status);
2636  return Status;
2637  }
2638 
2639  if (!keycmp(searchkey, tp.item->key)) {
2641 
2642  if (!NT_SUCCESS(Status)) {
2643  ERR("delete_tree_item returned %08lx\n", Status);
2644  return Status;
2645  }
2646  }
2647 
2648  // update superblock
2649 
2650  Vcb->superblock.num_devices--;
2651  Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2652  Vcb->devices_loaded--;
2653 
2654  RemoveEntryList(&dev->list_entry);
2655 
2656  // flush
2657 
2658  Status = do_write(Vcb, NULL);
2659  if (!NT_SUCCESS(Status))
2660  ERR("do_write returned %08lx\n", Status);
2661 
2662  free_trees(Vcb);
2663 
2664  if (!NT_SUCCESS(Status))
2665  return Status;
2666 
      // best effort: a failure to wipe the superblocks is logged but does not abort the removal
2667  if (!dev->readonly && dev->devobj) {
2669  if (!NT_SUCCESS(Status))
2670  WARN("remove_superblocks returned %08lx\n", Status);
2671  }
2672 
2673  // remove entry in volume list
2674 
2675  vde = Vcb->vde;
2676 
2677  if (dev->devobj) {
2678  pdo_device_extension* pdode = vde->pdode;
2679 
2681 
      // find the child whose UUID matches the device being removed
2682  le = pdode->children.Flink;
2683  while (le != &pdode->children) {
2685 
2686  if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2689  UNICODE_STRING mmdevpath;
2690 
2691  pdode->children_loaded--;
2692 
2693  if (vc->had_drive_letter) { // re-add entry to mountmgr
2696  if (!NT_SUCCESS(Status))
2697  ERR("IoGetDeviceObjectPointer returned %08lx\n", Status);
2698  else {
2699  MOUNTDEV_NAME mdn;
2700 
      // first query uses the fixed-size struct; mdn.NameLength then sizes the second, full query
2701  Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2703  ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2704  else {
2705  MOUNTDEV_NAME* mdn2;
2706  ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2707 
2708  mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2709  if (!mdn2)
2710  ERR("out of memory\n");
2711  else {
2712  Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2713  if (!NT_SUCCESS(Status))
2714  ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2715  else {
2717 
2718  name.Buffer = mdn2->Name;
2719  name.Length = name.MaximumLength = mdn2->NameLength;
2720 
2722  if (!NT_SUCCESS(Status))
2723  WARN("mountmgr_add_drive_letter returned %08lx\n", Status);
2724  }
2725 
2726  ExFreePool(mdn2);
2727  }
2728  }
2729 
2731  }
2732  }
2733 
2734  ExFreePool(vc->pnp_name.Buffer);
2736  ExFreePool(vc);
2737 
2739 
2740  break;
2741  }
2742 
2743  le = le->Flink;
2744  }
2745 
      // the volume stays "removable" only if at least one remaining child is removable
2746  if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2747  vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2748 
2749  le = pdode->children.Flink;
2750  while (le != &pdode->children) {
2752 
2753  if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2754  vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2755  break;
2756  }
2757 
2758  le = le->Flink;
2759  }
2760  }
2761 
2762  pdode->num_children = Vcb->superblock.num_devices;
2763 
2765 
2766  // free dev
2767 
2768  if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2770  }
2771 
2772  while (!IsListEmpty(&dev->space)) {
2773  LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2775 
2776  ExFreePool(s);
2777  }
2778 
2779  ExFreePool(dev);
2780 
      // Vcb->trim remains set only if some remaining device still has trim set
2781  if (Vcb->trim) {
2782  Vcb->trim = false;
2783 
2784  le = Vcb->devices.Flink;
2785  while (le != &Vcb->devices) {
2786  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2787 
2788  if (dev2->trim) {
2789  Vcb->trim = true;
2790  break;
2791  }
2792 
2793  le = le->Flink;
2794  }
2795  }
2796 
2798 
2799  return STATUS_SUCCESS;
2800 }
2801 
// Issues a TRIM for every region of device `dev` that is not covered by a
// DEV_EXTENT item.
// The device tree is walked for (dev_id, TYPE_DEV_EXTENT, *) items; each gap
// between consecutive extents, and the tail up to devitem.num_bytes, is queued
// through add_trim_entry_avoid_sb. The queued ranges are then packed behind a
// DEVICE_MANAGE_DATA_SET_ATTRIBUTES header with Action = DeviceDsmAction_Trim
// (the WARN string indicates IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES is the
// request used; the call itself is on a line missing from this listing).
// Nothing is returned; failures are only logged.
// The queue on dev->trim_list is always drained at `end:` once anything may
// have been queued, so a later call never sees stale entries.
// NOTE(review): some listing lines are absent from this copy (signature,
// dmdsa declaration/allocation, the ioctl call) - confirm against the real source.
2804  DEVICE_DATA_SET_RANGE* ranges;
2805  ULONG datalen, i;
2806  KEY searchkey;
2807  traverse_ptr tp;
2808  NTSTATUS Status;
2809  bool b;
2810  uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2811  LIST_ENTRY* le;
2812 
2813  dev->num_trim_entries = 0;
2814 
2815  searchkey.obj_id = dev->devitem.dev_id;
2816  searchkey.obj_type = TYPE_DEV_EXTENT;
2817  searchkey.offset = 0;
2818 
2819  Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2820  if (!NT_SUCCESS(Status)) {
2821  ERR("find_item returned %08lx\n", Status);
2822  return;
2823  }
2824 
2825  do {
2826  traverse_ptr next_tp;
2827 
2828  if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2829  if (tp.item->size >= sizeof(DEV_EXTENT)) {
2830  DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2831 
      // the gap between the end of the previous extent and the start of this one is unallocated
2832  if (tp.item->key.offset > lastoff)
2833  add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2834 
2835  lastoff = tp.item->key.offset + de->length;
2836  } else {
2837  ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
      // entries may already be queued on dev->trim_list: go through the common
      // cleanup instead of returning, otherwise they would be leaked and picked
      // up by the next call, whose range array is sized from num_trim_entries only
2838  goto end;
2839  }
2840  }
2841 
2842  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2843 
2844  if (b) {
2845  tp = next_tp;
      // stop once the walk has moved past this device's DEV_EXTENT items
2846  if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2847  break;
2848  }
2849  } while (b);
2850 
      // space after the last extent, up to the end of the device
2851  if (lastoff < dev->devitem.num_bytes)
2852  add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2853 
2854  if (dev->num_trim_entries == 0)
2855  return;
2856 
      // header rounded up to a multiple of 8 bytes, followed by one DEVICE_DATA_SET_RANGE per queued entry
2857  datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2858 
2860  if (!dmdsa) {
2861  ERR("out of memory\n");
2862  goto end;
2863  }
2864 
2865  dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2866  dmdsa->Action = DeviceDsmAction_Trim;
2868  dmdsa->ParameterBlockOffset = 0;
2869  dmdsa->ParameterBlockLength = 0;
2871  dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2872 
2873  ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2874 
2875  i = 0;
2876  le = dev->trim_list.Flink;
2877  while (le != &dev->trim_list) {
2879 
2880  ranges[i].StartingOffset = s->address;
2881  ranges[i].LengthInBytes = s->size;
2882  i++;
2883 
2884  le = le->Flink;
2885  }
2886 
2888  if (!NT_SUCCESS(Status))
2889  WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08lx\n", Status);
2890 
2891  ExFreePool(dmdsa);
2892 
2893 end:
      // drain the queue so nothing carries over into the next invocation
2894  while (!IsListEmpty(&dev->trim_list)) {
2896  ExFreePool(s);
2897  }
2898 
2899  dev->num_trim_entries = 0;
2900 }
2901 
// Tries to free up space for a new chunk by relocating data chunks, then
// allocates a chunk with the given `flags`.
// Each pass picks the least-used, writable data chunk that has not yet been
// stamped with the current Vcb->balance.balance_num, marks it for relocation
// and runs balance_data_chunk on it until that reports no further change; the
// chunk is then stamped with the current balance_num and the result flushed
// with do_write. When no candidate remains, alloc_chunk is called and, on
// success, the new chunk is stored in *newchunk.
// Returns the first failing status, or the alloc_chunk status.
// NOTE(review): when the balance is stopped (or the volume is read-only) this
// returns STATUS_SUCCESS from inside the loop WITHOUT writing *newchunk;
// callers must not use *newchunk in that case.
// NOTE(review): some listing lines are absent from this copy (signature, the
// chunk lookup inside the selection loop, and others) - confirm against the real source.
2903  NTSTATUS Status;
2904  bool changed;
2905  LIST_ENTRY* le;
2906  chunk* rc;
2907 
2908  // FIXME - allow with metadata chunks?
2909 
2910  while (true) {
2911  rc = NULL;
2912 
2913  ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2914 
2915  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2916 
2917  // choose the least-used chunk we haven't looked at yet
2918  le = Vcb->chunks.Flink;
2919  while (le != &Vcb->chunks) {
2921 
2922  // FIXME - skip full-size chunks over e.g. 90% full?
2923  if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2924  rc = c;
2925 
2926  le = le->Flink;
2927  }
2928 
2929  ExReleaseResourceLite(&Vcb->chunk_lock);
2930 
      // no candidate left: leave the loop and try the allocation below
2931  if (!rc) {
2932  ExReleaseResourceLite(&Vcb->tree_lock);
2933  break;
2934  }
2935 
      // a non-NULL Flink means the chunk is already queued for this balance
2936  if (rc->list_entry_balance.Flink) {
2938  Vcb->balance.chunks_left--;
2939  }
2940 
2941  rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2942  rc->reloc = true;
2943 
2944  ExReleaseResourceLite(&Vcb->tree_lock);
2945 
2946  do {
2947  changed = false;
2948 
2949  Status = balance_data_chunk(Vcb, rc, &changed);
2950  if (!NT_SUCCESS(Status)) {
2951  ERR("balance_data_chunk returned %08lx\n", Status);
2952  Vcb->balance.status = Status;
2954  rc->reloc = false;
2955  return Status;
2956  }
2957 
      // blocks here while Vcb->balance.event is not signalled
2958  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2959 
2960  if (Vcb->readonly)
2961  Vcb->balance.stopping = true;
2962 
2963  if (Vcb->balance.stopping)
2964  return STATUS_SUCCESS;
2965  } while (changed);
2966 
2968 
2969  rc->changed = true;
2970  rc->space_changed = true;
2971  rc->balance_num = Vcb->balance.balance_num;
2972 
2973  Status = do_write(Vcb, NULL);
2974  if (!NT_SUCCESS(Status)) {
2975  ERR("do_write returned %08lx\n", Status);
2976  return Status;
2977  }
2978 
2979  free_trees(Vcb);
2980  }
2981 
2982  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2983 
2984  Status = alloc_chunk(Vcb, flags, &rc, true);
2985 
2986  ExReleaseResourceLite(&Vcb->chunk_lock);
2987 
2988  if (NT_SUCCESS(Status)) {
2989  *newchunk = rc;
2990  return Status;
2991  } else {
2992  ERR("alloc_chunk returned %08lx\n", Status);
2993  return Status;
2994  }
2995 }
2996 
// Rebuilds dev->space, the list of unallocated regions on device `dev`.
// The existing list is emptied, one region covering everything past the first
// megabyte is added, and then the per-device stripe extent of every chunk
// stripe that lives on this device is subtracted again.
// Always returns STATUS_SUCCESS.
// NOTE(review): some listing lines are absent from this copy (signature, the
// list-entry pop in the first loop, the chunk lookup in the second) - confirm
// against the real source.
2998  LIST_ENTRY* le;
2999 
3000  while (!IsListEmpty(&dev->space)) {
3002 
3003  ExFreePool(s);
3004  }
3005 
3006  // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3007 
3008  space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3009 
3010  le = Vcb->chunks.Flink;
3011  while (le != &Vcb->chunks) {
3012  uint16_t n;
3014  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
      // Per-chunk cache of the stripe length, computed on first use. It is declared
      // at chunk scope (not inside the stripe loop) so that a chunk with several
      // stripes on this device computes it once; the value is identical for every
      // stripe of a chunk because it depends only on chunk_item fields.
      uint64_t stripe_size = 0;
3015 
3016  for (n = 0; n < c->chunk_item->num_stripes; n++) {
3019  if (cis[n].dev_id == dev->devitem.dev_id) {
3020  if (stripe_size == 0) {
3021  uint16_t factor;
3022 
      // factor = number of stripes the chunk's logical size is divided across
3023  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3024  factor = c->chunk_item->num_stripes;
3025  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3026  factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3027  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3028  factor = c->chunk_item->num_stripes - 1;
3029  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3030  factor = c->chunk_item->num_stripes - 2;
3031  else // SINGLE, DUP, RAID1, RAID1C3, RAID1C4
3032  factor = 1;
3033 
3034  stripe_size = c->chunk_item->size / factor;
3035  }
3036 
3037  space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3038  }
3039  }
3040 
3041  le = le->Flink;
3042  }
3043 
3044  return STATUS_SUCCESS;
3045 }
3046 
// System-thread entry point that performs a balance (or the chunk relocation
// behind a device removal / shrink) on the volume passed as `context`.
// Phases visible here:
//   1. bump balance_num, apply any requested profile conversion to
//      Vcb->data_flags / metadata_flags / system_flags (old values are kept so
//      they can be restored if the run is stopped or fails);
//   2. for a plain balance, persist a balance item (per the ERR string
//      "add_balance_item"); for removal/shrink just flush pending writes;
//   3. under chunk_lock, classify every chunk as data/metadata/system and queue
//      the selected ones on the local `chunks` list;
//   4. if a chunk type has no untouched chunk left, pre-allocate a fresh chunk
//      of that type, falling back once to try_consolidation on STATUS_DISK_FULL;
//   5. relocate queued data chunks, then metadata/system chunks, waiting on
//      Vcb->balance.event between steps so pause/stop take effect;
//   6. at `end:` undo state on stop/failure, finish a removal or shrink, remove
//      the balance item for a plain balance, TRIM unallocated space, close the
//      thread handle and signal Vcb->balance.finished.
// Errors are reported through Vcb->balance.status; nothing is returned.
// NOTE(review): a number of listing lines are absent from this copy (e.g. the
// Vcb declaration, the chunk lookup and filter call in the selection loop,
// several calls whose results are checked) - confirm against the real source.
3047 _Function_class_(KSTART_ROUTINE)
3048 void __stdcall balance_thread(void* context) {
3050  LIST_ENTRY chunks;
3051  LIST_ENTRY* le;
3052  uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3053  uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3054  NTSTATUS Status;
3055 
3056  Vcb->balance.balance_num++;
3057 
3058  Vcb->balance.stopping = false;
3059  KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3060 
      // when converting, new allocations of each type use the target profile from now on
3061  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3062  old_data_flags = Vcb->data_flags;
3063  Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3064 
3066  }
3067 
3068  if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3069  old_metadata_flags = Vcb->metadata_flags;
3070  Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3071  }
3072 
3073  if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3074  old_system_flags = Vcb->system_flags;
3075  Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3076  }
3077 
      // with mixed block groups the data and metadata options are kept identical
3078  if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3079  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3080  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3081  else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3082  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3083  }
3084 
3085  num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3086  Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3087 
3088  InitializeListHead(&chunks);
3089 
3090  // FIXME - what are we supposed to do with limit_start?
3091 
3092  if (!Vcb->readonly) {
3093  if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3095  if (!NT_SUCCESS(Status)) {
3096  ERR("add_balance_item returned %08lx\n", Status);
3097  Vcb->balance.status = Status;
3098  goto end;
3099  }
3100  } else {
3101  if (Vcb->need_write) {
3102  Status = do_write(Vcb, NULL);
3103 
3104  free_trees(Vcb);
3105 
3106  if (!NT_SUCCESS(Status)) {
3107  ERR("do_write returned %08lx\n", Status);
3108  Vcb->balance.status = Status;
3109  goto end;
3110  }
3111  }
3112  }
3113  }
3114 
      // blocks here while Vcb->balance.event is not signalled (e.g. started paused)
3115  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3116 
3117  if (Vcb->balance.stopping)
3118  goto end;
3119 
3120  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3121 
3122  le = Vcb->chunks.Flink;
3123  while (le != &Vcb->chunks) {
3125  uint8_t sort;
3126 
3128 
3129  if (c->chunk_item->type & BLOCK_FLAG_DATA)
3131  else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3133  else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3135  else {
3136  ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3138  break;
3139  }
3140 
      // chunks that are queued count towards the totals; the rest are tallied
      // per type as "okay" (left untouched by this run)
3141  if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3143  InsertTailList(&chunks, &c->list_entry_balance);
3144 
3145  num_chunks[sort]++;
3146  Vcb->balance.total_chunks++;
3147  Vcb->balance.chunks_left++;
3148  } else if (sort == BALANCE_OPTS_METADATA)
3149  okay_metadata_chunks++;
3150  else if (sort == BALANCE_OPTS_DATA)
3151  okay_data_chunks++;
3152  else if (sort == BALANCE_OPTS_SYSTEM)
3153  okay_system_chunks++;
3154 
3155  if (!c->cache_loaded) {
3157 
3158  if (!NT_SUCCESS(Status)) {
3159  ERR("load_cache_chunk returned %08lx\n", Status);
3160  Vcb->balance.status = Status;
3162  ExReleaseResourceLite(&Vcb->chunk_lock);
3163  goto end;
3164  }
3165  }
3166 
3168 
3169  le = le->Flink;
3170  }
3171 
3172  ExReleaseResourceLite(&Vcb->chunk_lock);
3173 
3174  // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3175  if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3176  bool consolidated = false;
3177  chunk* c;
3178 
3179  if (okay_metadata_chunks == 0) {
3180  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3181 
3182  Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3183  if (NT_SUCCESS(Status))
3184  c->balance_num = Vcb->balance.balance_num;
3185  else if (Status != STATUS_DISK_FULL || consolidated) {
3186  ERR("alloc_chunk returned %08lx\n", Status);
3187  ExReleaseResourceLite(&Vcb->chunk_lock);
3188  Vcb->balance.status = Status;
3189  goto end;
3190  }
3191 
3192  ExReleaseResourceLite(&Vcb->chunk_lock);
3193 
3194  if (Status == STATUS_DISK_FULL) {
3195  Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3196  if (!NT_SUCCESS(Status)) {
3197  ERR("try_consolidation returned %08lx\n", Status);
3198  Vcb->balance.status = Status;
3199  goto end;
3200  }
3201 
      // try_consolidation returns success without setting c when the balance
      // was stopped, so the stop check must come before c is dereferenced
3202  if (Vcb->balance.stopping)
3203  goto end;
3204 
3205  c->balance_num = Vcb->balance.balance_num;
3206  consolidated = true;
3207  }
3208  }
3209 
3210  if (okay_data_chunks == 0) {
3211  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3212 
3213  Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3214  if (NT_SUCCESS(Status))
3215  c->balance_num = Vcb->balance.balance_num;
3216  else if (Status != STATUS_DISK_FULL || consolidated) {
3217  ERR("alloc_chunk returned %08lx\n", Status);
3218  ExReleaseResourceLite(&Vcb->chunk_lock);
3219  Vcb->balance.status = Status;
3220  goto end;
3221  }
3222 
3223  ExReleaseResourceLite(&Vcb->chunk_lock);
3224 
3225  if (Status == STATUS_DISK_FULL) {
3226  Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3227  if (!NT_SUCCESS(Status)) {
3228  ERR("try_consolidation returned %08lx\n", Status);
3229  Vcb->balance.status = Status;
3230  goto end;
3231  }
3232 
      // see above: c is only valid when the run was not stopped
3233  if (Vcb->balance.stopping)
3234  goto end;
3235 
3236  c->balance_num = Vcb->balance.balance_num;
3237  consolidated = true;
3238  }
3239  }
3240 
3241  if (okay_system_chunks == 0) {
3242  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3243 
3244  Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3245  if (NT_SUCCESS(Status))
3246  c->balance_num = Vcb->balance.balance_num;
3247  else if (Status != STATUS_DISK_FULL || consolidated) {
3248  ERR("alloc_chunk returned %08lx\n", Status);
3249  ExReleaseResourceLite(&Vcb->chunk_lock);
3250  Vcb->balance.status = Status;
3251  goto end;
3252  }
3253 
3254  ExReleaseResourceLite(&Vcb->chunk_lock);
3255 
3256  if (Status == STATUS_DISK_FULL) {
3257  Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3258  if (!NT_SUCCESS(Status)) {
3259  ERR("try_consolidation returned %08lx\n", Status);
3260  Vcb->balance.status = Status;
3261  goto end;
3262  }
3263 
      // see above: c is only valid when the run was not stopped
3264  if (Vcb->balance.stopping)
3265  goto end;
3266 
3267  c->balance_num = Vcb->balance.balance_num;
3268  consolidated = true;
3269  }
3270  }
3271  }
3272 
3273  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3274 
      // flag every queued chunk as being relocated before any work starts
3275  le = chunks.Flink;
3276  while (le != &chunks) {
3277  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3278 
3279  c->reloc = true;
3280 
3281  le = le->Flink;
3282  }
3283 
3284  ExReleaseResourceLite(&Vcb->chunk_lock);
3285 
3286  // do data chunks before metadata
3287  le = chunks.Flink;
3288  while (le != &chunks) {
3289  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
      // next entry is captured up front because c may be unlinked below
3290  LIST_ENTRY* le2 = le->Flink;
3291 
3292  if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3293  bool changed;
3294 
3295  do {
3296  changed = false;
3297 
3298  Status = balance_data_chunk(Vcb, c, &changed);
3299  if (!NT_SUCCESS(Status)) {
3300  ERR("balance_data_chunk returned %08lx\n", Status);
3301  Vcb->balance.status = Status;
3302  goto end;
3303  }
3304 
3305  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3306 
3307  if (Vcb->readonly)
3308  Vcb->balance.stopping = true;
3309 
3310  if (Vcb->balance.stopping)
3311  break;
3312  } while (changed);
3313 
3314  c->changed = true;
3315  c->space_changed = true;
3316  }
3317 
3318  if (Vcb->balance.stopping)
3319  goto end;
3320 
      // a mixed data+metadata chunk stays queued for the metadata pass when
      // metadata balancing is enabled
3321  if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3322  (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3323  RemoveEntryList(&c->list_entry_balance);
3324  c->list_entry_balance.Flink = NULL;
3325 
3326  Vcb->balance.chunks_left--;
3327  }
3328 
3329  le = le2;
3330  }
3331 
3332  // do metadata chunks
3333  while (!IsListEmpty(&chunks)) {
3334  chunk* c;
3335  bool changed;
3336 
3337  le = RemoveHeadList(&chunks);
3338  c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3339 
3340  if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3341  do {
3342  Status = balance_metadata_chunk(Vcb, c, &changed);
3343  if (!NT_SUCCESS(Status)) {
3344  ERR("balance_metadata_chunk returned %08lx\n", Status);
3345  Vcb->balance.status = Status;
3346  goto end;
3347  }
3348 
3349  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3350 
3351  if (Vcb->readonly)
3352  Vcb->balance.stopping = true;
3353 
3354  if (Vcb->balance.stopping)
3355  break;
3356  } while (changed);
3357 
3358  c->changed = true;
3359  c->space_changed = true;
3360  }
3361 
3362  if (Vcb->balance.stopping)
3363  break;
3364 
3365  c->list_entry_balance.Flink = NULL;
3366 
3367  Vcb->balance.chunks_left--;
3368  }
3369 
3370 end:
3371  if (!Vcb->readonly) {
      // stopped or failed: un-flag whatever is still queued and restore the
      // allocation profiles that were in force before the run
3372  if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3373  le = chunks.Flink;
3374  while (le != &chunks) {
3375  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3376  c->reloc = false;
3377 
3378  le = le->Flink;
3379  c->list_entry_balance.Flink = NULL;
3380  }
3381 
3382  if (old_data_flags != 0)
3383  Vcb->data_flags = old_data_flags;
3384 
3385  if (old_metadata_flags != 0)
3386  Vcb->metadata_flags = old_metadata_flags;
3387 
3388  if (old_system_flags != 0)
3389  Vcb->system_flags = old_system_flags;
3390  }
3391 
3392  if (Vcb->balance.removing) {
3393  device* dev = NULL;
3394 
3395  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3396 
      // the device being removed is identified by opts[0].devid
3397  le = Vcb->devices.Flink;
3398  while (le != &Vcb->devices) {
3399  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3400 
3401  if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3402  dev = dev2;
3403  break;
3404  }
3405 
3406  le = le->Flink;
3407  }
3408 
3409  if (dev) {
      // the removal is only finalised once every queued chunk has been relocated
3410  if (Vcb->balance.chunks_left == 0) {
3412 
3413  if (!NT_SUCCESS(Status)) {
3414  ERR("finish_removing_device returned %08lx\n", Status);
3415  dev->reloc = false;
3416  }
3417  } else
3418  dev->reloc = false;
3419  }
3420 
3421  ExReleaseResourceLite(&Vcb->tree_lock);
3422  } else if (Vcb->balance.shrinking) {
3423  device* dev = NULL;
3424 
3425  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3426 
3427  le = Vcb->devices.Flink;
3428  while (le != &Vcb->devices) {
3429  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3430 
3431  if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3432  dev = dev2;
3433  break;
3434  }
3435 
3436  le = le->Flink;
3437  }
3438 
      // a missing device forces an error status, which routes into the
      // failure branch below (where dev is checked before use)
3439  if (!dev) {
3440  ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3441  Vcb->balance.status = STATUS_INTERNAL_ERROR;
3442  }
3443 
3444  if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3445  if (dev) {
3447  if (!NT_SUCCESS(Status))
3448  WARN("regenerate_space_list returned %08lx\n", Status);
3449  }
3450  } else {
3451  uint64_t old_size;
3452 
      // the new device size is the start of the range that was emptied
3453  old_size = dev->devitem.num_bytes;
3454  dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3455 
3457  if (!NT_SUCCESS(Status)) {
3458  ERR("update_dev_item returned %08lx\n", Status);
3459  dev->devitem.num_bytes = old_size;
3460  Vcb->balance.status = Status;
3461 
3463  if (!NT_SUCCESS(Status))
3464  WARN("regenerate_space_list returned %08lx\n", Status);
3465  } else {
3466  Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3467 
3468  Status = do_write(Vcb, NULL);
3469  if (!NT_SUCCESS(Status))
3470  ERR("do_write returned %08lx\n", Status);
3471 
3472  free_trees(Vcb);
3473  }
3474  }
3475 
3476  ExReleaseResourceLite(&Vcb->tree_lock);
3477 
3478  if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3480  } else {
3482  if (!NT_SUCCESS(Status)) {
      // Only log the failure. Jumping back to `end` from here would re-run
      // this cleanup: it could spin forever while the call keeps failing,
      // and after a stop/failure it would walk `chunks` entries whose Flink
      // has already been set to NULL above.
3483  ERR("remove_balance_item returned %08lx\n", Status);
3485  }
3486  }
3487 
3488  if (Vcb->trim && !Vcb->options.no_trim) {
3489  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3490 
3491  le = Vcb->devices.Flink;
3492  while (le != &Vcb->devices) {
3493  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3494 
3495  if (dev2->devobj && !dev2->readonly && dev2->trim)
3496  trim_unalloc_space(Vcb, dev2);
3497 
3498  le = le->Flink;
3499  }
3500 
3501  ExReleaseResourceLite(&Vcb->tree_lock);
3502  }
3503  }
3504 
      // a NULL thread handle is what the control routines test to see whether a balance is running
3505  ZwClose(Vcb->balance.thread);
3506  Vcb->balance.thread = NULL;
3507 
3508  KeSetEvent(&Vcb->balance.finished, 0, false);
3509 }
3510 
// Validates a caller-supplied btrfs_start_balance request (`data`, `length`)
// and, if acceptable, copies its options into Vcb->balance.opts and spawns
// balance_thread.
// Rejections visible here:
//   - STATUS_INVALID_PARAMETER: buffer missing/too short, or an enabled option
//     set fails validation (empty profile mask, devid 0, inverted drange /
//     vrange / limit / stripes / usage range, unsupported convert target);
//   - STATUS_DEVICE_NOT_READY: volume locked, scrub running, or a balance
//     already running;
//   - missing SE_MANAGE_VOLUME_PRIVILEGE or a read-only volume (the returned
//     codes are on lines absent from this listing).
// Note that the request buffer is normalised in place: limit and stripes
// bounds are raised to at least 1 and usage bounds are capped at 100.
// Returns the PsCreateSystemThread status on thread-creation failure,
// otherwise STATUS_SUCCESS.
// NOTE(review): several listing lines are absent from this copy (signature,
// the bsb declaration, some return statements and parts of multi-line
// conditions) - confirm against the real source.
3512  NTSTATUS Status;
3514  OBJECT_ATTRIBUTES oa;
3515  uint8_t i;
3516 
3517  if (length < sizeof(btrfs_start_balance) || !data)
3518  return STATUS_INVALID_PARAMETER;
3519 
3520  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3522 
3523  if (Vcb->locked) {
3524  WARN("cannot start balance while locked\n");
3525  return STATUS_DEVICE_NOT_READY;
3526  }
3527 
3528  if (Vcb->scrub.thread) {
3529  WARN("cannot start balance while scrub running\n");
3530  return STATUS_DEVICE_NOT_READY;
3531  }
3532 
3533  if (Vcb->balance.thread) {
3534  WARN("balance already running\n");
3535  return STATUS_DEVICE_NOT_READY;
3536  }
3537 
3538  if (Vcb->readonly)
3540 
3544  return STATUS_SUCCESS;
3545 
      // index i runs over the three option sets in bsb->opts[]
3546  for (i = 0; i < 3; i++) {
3547  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3548  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3552 
3553  if (bsb->opts[i].profiles == 0)
3554  return STATUS_INVALID_PARAMETER;
3555  }
3556 
3557  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3558  if (bsb->opts[i].devid == 0)
3559  return STATUS_INVALID_PARAMETER;
3560  }
3561 
3562  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3563  if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3564  return STATUS_INVALID_PARAMETER;
3565  }
3566 
3567  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3568  if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3569  return STATUS_INVALID_PARAMETER;
3570  }
3571 
3572  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3573  bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3574  bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3575 
3576  if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3577  return STATUS_INVALID_PARAMETER;
3578  }
3579 
3580  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3581  bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3582  bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3583 
3584  if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3585  return STATUS_INVALID_PARAMETER;
3586  }
3587 
      // usage is a percentage: cap the caller's own usage bounds at 100 and
      // reject an inverted range. (This must read the usage_* fields - using
      // the stripes_* fields here would overwrite the requested usage filter
      // with unrelated values.)
3588  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3589  bsb->opts[i].usage_start = min(100, bsb->opts[i].usage_start);
3590  bsb->opts[i].usage_end = min(100, bsb->opts[i].usage_end);
3591 
3592  if (bsb->opts[i].usage_start > bsb->opts[i].usage_end)
3593  return STATUS_INVALID_PARAMETER;
3594  }
3595 
3596  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3597  if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3599  bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3601  bsb->opts[i].convert != BLOCK_FLAG_RAID1C4)
3602  return STATUS_INVALID_PARAMETER;
3603  }
3604  }
3605  }
3606 
3607  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3610 
3611  Vcb->balance.paused = false;
3612  Vcb->balance.removing = false;
3613  Vcb->balance.shrinking = false;
3614  Vcb->balance.status = STATUS_SUCCESS;
      // the event starts signalled (not paused) so the thread's waits fall straight through
3615  KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3616 
3618 
3619  Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3620  if (!NT_SUCCESS(Status)) {
3621  ERR("PsCreateSystemThread returned %08lx\n", Status);
3622  return Status;
3623  }
3624 
3625  return STATUS_SUCCESS;
3626 }
3627 
// Looks in the root tree for a persisted balance item, key
// (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0), and if one exists resumes that balance:
// its data/metadata/system arguments are loaded into Vcb->balance.opts, a few
// resume heuristics are applied, and balance_thread is started.
// The balance starts paused when the volume is read-only or the skip_balance
// option is set.
// Returns STATUS_NOT_FOUND when no item exists, STATUS_INTERNAL_ERROR when the
// item is too short, a failing find_item / PsCreateSystemThread status, or
// STATUS_SUCCESS.
// NOTE(review): the signature line and the line before PsCreateSystemThread
// are absent from this copy of the listing - confirm against the real source.
3629  KEY searchkey;
3630  traverse_ptr tp;
3631  NTSTATUS Status;
3632  BALANCE_ITEM* bi;
3633  OBJECT_ATTRIBUTES oa;
3634  int i;
3635 
3636  searchkey.obj_id = BALANCE_ITEM_ID;
3637  searchkey.obj_type = TYPE_TEMP_ITEM;
3638  searchkey.offset = 0;
3639 
3640  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3641  if (!NT_SUCCESS(Status)) {
3642  ERR("find_item returned %08lx\n", Status);
3643  return Status;
3644  }
3645 
      // find_item can succeed with a different key; only an exact match counts
3646  if (keycmp(tp.item->key, searchkey)) {
3647  TRACE("no balance item found\n");
3648  return STATUS_NOT_FOUND;
3649  }
3650 
3651  if (tp.item->size < sizeof(BALANCE_ITEM)) {
3652  WARN("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3653  tp.item->size, sizeof(BALANCE_ITEM));
3654  return STATUS_INTERNAL_ERROR;
3655  }
3656 
3657  bi = (BALANCE_ITEM*)tp.item->data;
3658 
3659  if (bi->flags & BALANCE_FLAGS_DATA)
3660  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3661 
3662  if (bi->flags & BALANCE_FLAGS_METADATA)
3663  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3664 
3665  if (bi->flags & BALANCE_FLAGS_SYSTEM)
3666  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3667 
3668  // do the heuristics that Linux driver does
3669 
3670  for (i = 0; i < 3; i++) {
3671  if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3672  // if converting, don't redo chunks already done
3673 
3674  if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3675  Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3676 
3677  // don't balance chunks more than 90% filled - presumably these
3678  // have already been done
3679 
3680  if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3681  !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3682  ) {
3683  Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3684  Vcb->balance.opts[i].usage_start = 0;
3685  Vcb->balance.opts[i].usage_end = 90;
3686  }
3687  }
3688  }
3689 
3690  if (Vcb->readonly || Vcb->options.skip_balance)
3691  Vcb->balance.paused = true;
3692  else
3693  Vcb->balance.paused = false;
3694 
3695  Vcb->balance.removing = false;
3696  Vcb->balance.shrinking = false;
3697  Vcb->balance.status = STATUS_SUCCESS;
      // the event is created non-signalled when paused, so the thread blocks at its first wait
3698  KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3699 
3701 
3702  Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3703  if (!NT_SUCCESS(Status)) {
3704  ERR("PsCreateSystemThread returned %08lx\n", Status);
3705  return Status;
3706  }
3707 
3708  return STATUS_SUCCESS;
3709 }
3710 
// Fills the caller's btrfs_query_balance buffer (`data`, `length`) with the
// current balance state of the volume.
// With no balance thread, only the error information is filled in here (when
// the last recorded balance status is a failure). With a thread running, the
// buffer receives the paused/running state, the removal / error flags, the
// chunk counters, the status code and a copy of all three option sets.
// Returns STATUS_INVALID_PARAMETER for a missing or too-short buffer,
// otherwise STATUS_SUCCESS.
// NOTE(review): the signature, the bqb declaration and two assignments
// (listing lines 3718 and 3734) are absent from this copy - confirm against
// the real source.
3713 
3714  if (length < sizeof(btrfs_query_balance) || !data)
3715  return STATUS_INVALID_PARAMETER;
3716 
3717  if (!Vcb->balance.thread) {
3719 
3720  if (!NT_SUCCESS(Vcb->balance.status)) {
3721  bqb->status |= BTRFS_BALANCE_ERROR;
3722  bqb->error = Vcb->balance.status;
3723  }
3724 
3725  return STATUS_SUCCESS;
3726  }
3727 
3728  bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3729 
3730  if (Vcb->balance.removing)
3731  bqb->status |= BTRFS_BALANCE_REMOVAL;
3732 
3733  if (Vcb->balance.shrinking)
3735 
3736  if (!NT_SUCCESS(Vcb->balance.status))
3737  bqb->status |= BTRFS_BALANCE_ERROR;
3738 
3739  bqb->chunks_left = Vcb->balance.chunks_left;
3740  bqb->total_chunks = Vcb->balance.total_chunks;
3741  bqb->error = Vcb->balance.status;
3742  RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3743  RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3744  RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3745 
3746  return STATUS_SUCCESS;
3747 }
3748 
// Pauses the balance: sets Vcb->balance.paused and clears Vcb->balance.event.
// NOTE(review): the signature (listing line 3749) is absent from this listing;
// the body uses `Vcb` and `processor_mode`.
// Returns STATUS_DEVICE_NOT_READY when there is no balance thread or the
// balance is already paused, STATUS_SUCCESS once the pause has been recorded.
// The caller is checked for SE_MANAGE_VOLUME_PRIVILEGE first.
3750  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
// NOTE(review): listing line 3751 - the statement run when the privilege check
// fails - is missing; presumably an error return. As the text stands the `if`
// above would govern the thread check below. Confirm against upstream.
3752 
3753  if (!Vcb->balance.thread)
3754  return STATUS_DEVICE_NOT_READY;
3755 
3756  if (Vcb->balance.paused)
3757  return STATUS_DEVICE_NOT_READY;
3758 
// Flag first, then reset the event to non-signalled; presumably the balance
// thread waits on this event - the wait itself is not visible here.
3759  Vcb->balance.paused = true;
3760  KeClearEvent(&Vcb->balance.event);
3761 
3762  return STATUS_SUCCESS;
3763 }
3764 
// Resumes a paused balance: clears Vcb->balance.paused and signals
// Vcb->balance.event.
// NOTE(review): the signature (listing line 3765) is absent from this listing;
// the body uses `Vcb` and `processor_mode`.
// Returns STATUS_DEVICE_NOT_READY when there is no balance thread or the
// balance is not currently paused, STATUS_SUCCESS after resuming.
// The caller is checked for SE_MANAGE_VOLUME_PRIVILEGE first.
3766  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
// NOTE(review): listing line 3767 - the statement run when the privilege check
// fails - is missing; presumably an error return. Confirm against upstream.
3768 
3769  if (!Vcb->balance.thread)
3770  return STATUS_DEVICE_NOT_READY;
3771 
3772  if (!Vcb->balance.paused)
3773  return STATUS_DEVICE_NOT_READY;
3774 
3775  if (Vcb->readonly)
// NOTE(review): listing line 3776 - the statement run for a read-only volume -
// is missing; presumably an error return so that a read-only volume is never
// resumed. As the text stands the `if` above would govern the assignment
// below. Confirm against upstream.
3777 
// Flag first, then signal the event (priority increment 0, Wait = false).
3778  Vcb->balance.paused = false;
3779  KeSetEvent(&Vcb->balance.event, 0, false);
3780 
3781  return STATUS_SUCCESS;
3782 }
3783 
// Requests that the balance stop: sets Vcb->balance.stopping and signals
// Vcb->balance.event.
// NOTE(review): the signature (listing line 3784) is absent from this listing;
// the body uses `Vcb` and `processor_mode`.
// Returns STATUS_DEVICE_NOT_READY when there is no balance thread,
// STATUS_SUCCESS once the stop request has been recorded. Nothing visible
// here waits for the thread to finish.
// The caller is checked for SE_MANAGE_VOLUME_PRIVILEGE first.
3785  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
// NOTE(review): listing line 3786 - the statement run when the privilege check
// fails - is missing; presumably an error return. Confirm against upstream.
3787 
3788  if (!Vcb->balance.thread)
3789  return STATUS_DEVICE_NOT_READY;
3790 
// The paused flag is cleared and the event signalled even when the balance was
// paused - presumably so a waiting thread wakes and sees `stopping`; the
// thread side is not visible here. The stored status is reset to success.
3791  Vcb->balance.paused = false;
3792  Vcb->balance.stopping = true;
3793  Vcb->balance.status = STATUS_SUCCESS;
3794  KeSetEvent(&Vcb->balance.event, 0, false);
3795 
3796  return STATUS_SUCCESS;
3797 }
3798 
3800  uint64_t devid;
3801  LIST_ENTRY* le;
3802  device* dev = NULL;
3803  NTSTATUS Status;
3804  int i;
3805  uint64_t num_rw_devices;
3806  OBJECT_ATTRIBUTES oa;
3807 
3808  TRACE("(%p, %p, %lx)\n", Vcb, data, length);
3809 
3810  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3812 
3813  if (length < sizeof(uint64_t))
3814  return STATUS_INVALID_PARAMETER;
3815 
3816  devid = *(uint64_t*)data;
3817 
3818  ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3819 
3820  if (Vcb->readonly) {
3821  ExReleaseResourceLite(&Vcb->tree_lock);
3823  }
3824 
3825  num_rw_devices = 0;
3826 
3827  le = Vcb->devices.Flink;
3828  while (le != &Vcb->devices) {
3829  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3830 
3831  if (dev2->devitem.dev_id == devid)
3832  dev = dev2;
3833 
3834  if (!dev2->readonly)
3835  num_rw_devices++;
3836 
3837  le = le->Flink;
3838  }
3839 
3840  if (!dev) {
3841  ExReleaseResourceLite(&Vcb->tree_lock);
3842  WARN("device %I64x not found\n", devid);
3843  return STATUS_NOT_FOUND;
3844  }
3845 
3846  if (!dev->readonly) {
3847  if (num_rw_devices == 1) {
3848  ExReleaseResourceLite(&Vcb->tree_lock);
3849  WARN("not removing last non-readonly device\n");
3850  return STATUS_INVALID_PARAMETER;
3851  }
3852 
3853  if (num_rw_devices == 4 &&
3854  ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3855  (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6) ||
3856  (Vcb->data_flags & BLOCK_FLAG_RAID1C4 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C4 || Vcb->system_flags & BLOCK_FLAG_RAID1C4)
3857  )
3858  ) {
3859  ExReleaseResourceLite(&Vcb->tree_lock);
3860  ERR("would not be enough devices to satisfy RAID requirement (RAID6/10/1C4)\n");
3861  return STATUS_CANNOT_DELETE;
3862  }
3863 
3864  if (num_rw_devices == 3 &&
3865  ((Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5) ||
3866  (Vcb->data_flags & BLOCK_FLAG_RAID1C3 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C3 || Vcb->system_flags & BLOCK_FLAG_RAID1C3))
3867  ) {
3868  ExReleaseResourceLite(&Vcb->tree_lock);
3869  ERR("would not be enough devices to satisfy RAID requirement (RAID5/1C3)\n");
3870  return STATUS_CANNOT_DELETE;
3871  }
3872 
3873  if (num_rw_devices == 2 &&
3874  ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3875  (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags &