ReactOS  0.4.15-dev-1206-g731eddf
balance.c
Go to the documentation of this file.
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include "crc32c.h"
21 #include <ntddstor.h>
22 
23 typedef struct {
28  tree* t;
29  bool system;
33 
34 typedef struct {
37 
38  union {
41  };
42 
44  bool top;
47 
48 typedef struct {
56 } data_reloc;
57 
58 typedef struct {
61 
62  union {
65  };
66 
70 
// Fallback definition for compilers other than MSVC, whose headers may not
// declare this flag yet (see the original "not in mingw yet" remark).
71 #ifndef _MSC_VER // not in mingw yet
72 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
73 #endif
74 
// 0x100000 bytes = 1 MB: the most that is read in one go, per the original remark.
75 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
76 
// add_metadata_reloc (name taken from the callers' error messages; the first line of
// the signature is not part of this listing).
//
// Creates a metadata_reloc record for the tree-block extent item that tp points at:
// stores the block's address and extent item pointer, gives the block's space back to
// its chunk when a chunk is known, and gathers the block's backreferences - first the
// ones stored inline after the EXTENT_ITEM, then, if the refcount says there are more,
// the standalone TREE_BLOCK_REF / SHARED_BLOCK_REF items that follow with the same
// object id (each of which is deleted from the tree).
//
// skinny   - true when the item has no EXTENT_ITEM2 between the EXTENT_ITEM header and
//            the inline references.
// mr2      - optional; if non-NULL it receives the new record.
// c        - chunk holding the block, or NULL.
// rollback - handed on to space_list_add.
//
// Returns STATUS_SUCCESS, or an error status from a failed step.
//
// NOTE(review): several statements (allocations, list insertions, the out-of-memory
// returns, the first delete_tree_item call) are missing from this listing, so the
// comments below describe only what the visible lines show.
78  bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
80  metadata_reloc* mr;
81  EXTENT_ITEM* ei;
82  uint16_t len;
83  uint64_t inline_rc;
84  uint8_t* ptr;
85 
87  if (!mr) {
88  ERR("out of memory\n");
90  }
91 
92  mr->address = tp->item->key.obj_id;
93  mr->data = NULL;
94  mr->ei = (EXTENT_ITEM*)tp->item->data;
95  mr->system = false;
97 
    // The call whose result is checked here is not shown; per the message it is delete_tree_item.
99  if (!NT_SUCCESS(Status)) {
100  ERR("delete_tree_item returned %08lx\n", Status);
101  ExFreePool(mr);
102  return Status;
103  }
104 
    // NOTE(review): the statement guarded by this if is missing from the listing -
    // presumably it looks up the chunk for the block's address; confirm.
105  if (!c)
107 
108  if (c) {
110 
        // The block is leaving this chunk: drop its size from the usage count and
        // return the range to the chunk's space list.
111  c->used -= Vcb->superblock.node_size;
112 
113  space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
114 
116  }
117 
118  ei = (EXTENT_ITEM*)tp->item->data;
119  inline_rc = 0;
120 
    // Inline references begin right after the EXTENT_ITEM header; non-skinny items
    // carry an EXTENT_ITEM2 first, which is stepped over.
121  len = tp->item->size - sizeof(EXTENT_ITEM);
122  ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
123  if (!skinny) {
124  len -= sizeof(EXTENT_ITEM2);
125  ptr += sizeof(EXTENT_ITEM2);
126  }
127 
    // Each inline entry is one type byte followed by a payload whose size depends on the type.
128  while (len > 0) {
129  uint8_t secttype = *ptr;
130  uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
132 
        // Account for the type byte just read.
133  len--;
134 
        // NOTE(review): this return and the next ones do not free mr, unlike the
        // delete_tree_item failure path above; from the visible lines it does not look
        // as if mr has been linked into a list yet - possible leak, confirm.
135  if (sectlen > len) {
136  ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
137  return STATUS_INTERNAL_ERROR;
138  }
139 
140  if (sectlen == 0) {
141  ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
142  return STATUS_INTERNAL_ERROR;
143  }
144 
146  if (!ref) {
147  ERR("out of memory\n");
149  }
150 
        // Copy the payload that follows the type byte into the new ref.
151  if (secttype == TYPE_TREE_BLOCK_REF) {
152  ref->type = TYPE_TREE_BLOCK_REF;
153  RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
154  inline_rc++;
155  } else if (secttype == TYPE_SHARED_BLOCK_REF) {
156  ref->type = TYPE_SHARED_BLOCK_REF;
157  RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
158  inline_rc++;
159  } else {
160  ERR("unexpected tree type %x\n", secttype);
161  ExFreePool(ref);
162  return STATUS_INTERNAL_ERROR;
163  }
164 
165  ref->parent = NULL;
166  ref->top = false;
168 
169  len -= sectlen;
170  ptr += sizeof(uint8_t) + sectlen;
171  }
172 
    // Fewer inline references than the refcount claims: the rest are separate items
    // that follow this one and share its object id. Walk forward until the id changes.
173  if (inline_rc < ei->refcount) { // look for non-inline entries
174  traverse_ptr tp2 = *tp, next_tp;
175 
176  while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
177  tp2 = next_tp;
178 
179  if (tp2.item->key.obj_id == tp->item->key.obj_id) {
180  if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
182  if (!ref) {
183  ERR("out of memory\n");
185  }
186 
                    // For standalone items the referenced value lives in the key offset.
187  ref->type = TYPE_TREE_BLOCK_REF;
188  ref->tbr.offset = tp2.item->key.offset;
189  ref->parent = NULL;
190  ref->top = false;
192 
193  Status = delete_tree_item(Vcb, &tp2);
194  if (!NT_SUCCESS(Status)) {
195  ERR("delete_tree_item returned %08lx\n", Status);
196  return Status;
197  }
198  } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
200  if (!ref) {
201  ERR("out of memory\n");
203  }
204 
205  ref->type = TYPE_SHARED_BLOCK_REF;
206  ref->sbr.offset = tp2.item->key.offset;
207  ref->parent = NULL;
208  ref->top = false;
210 
211  Status = delete_tree_item(Vcb, &tp2);
212  if (!NT_SUCCESS(Status)) {
213  ERR("delete_tree_item returned %08lx\n", Status);
214  return Status;
215  }
216  }
217  } else
218  break;
219  }
220  }
221 
223 
    // Hand the record back only if the caller asked for it.
224  if (mr2)
225  *mr2 = mr;
226 
227  return STATUS_SUCCESS;
228 }
229 
// add_metadata_reloc_parent (name taken from its callers; the signature line is not
// part of this listing).
//
// Yields, through *mr2, the metadata_reloc describing the tree block at `address`.
// If `items` already holds a record with that address it is reused; otherwise the
// block's extent item is looked up in the extent tree and a new record is created
// with add_metadata_reloc (passing NULL for the chunk).
//
// Returns STATUS_SUCCESS, the status of a failed lookup/creation, or
// STATUS_INTERNAL_ERROR when no usable extent item exists for the address.
232  LIST_ENTRY* le;
233  KEY searchkey;
235  bool skinny = false;
237 
    // First look for an existing record for this address.
238  le = items->Flink;
239  while (le != items) {
241 
242  if (mr->address == address) {
243  *mr2 = mr;
244  return STATUS_SUCCESS;
245  }
246 
247  le = le->Flink;
248  }
249 
    // Not known yet: search the extent tree with the largest possible offset for this address.
250  searchkey.obj_id = address;
251  searchkey.obj_type = TYPE_METADATA_ITEM;
252  searchkey.offset = 0xffffffffffffffff;
253 
254  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
255  if (!NT_SUCCESS(Status)) {
256  ERR("find_item returned %08lx\n", Status);
257  return Status;
258  }
259 
    // NOTE(review): the condition that leads to skinny = true is missing from the
    // listing - presumably it matches a TYPE_METADATA_ITEM for this address; confirm.
261  skinny = true;
262  else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
263  tp.item->size >= sizeof(EXTENT_ITEM)) {
265 
        // A full-size EXTENT_ITEM only counts if it is flagged as a tree block.
266  if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
267  ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
268  return STATUS_INTERNAL_ERROR;
269  }
270  } else {
271  ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
272  return STATUS_INTERNAL_ERROR;
273  }
274 
275  Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
276  if (!NT_SUCCESS(Status)) {
277  ERR("add_metadata_reloc returned %08lx\n", Status);
278  return Status;
279  }
280 
281  return STATUS_SUCCESS;
282 }
283 
// Reorders the reference list mr->refs with an insertion sort into a temporary list,
// then splices the temporary list back into mr->refs. Before being placed, each ref
// gets a hash: tbr.offset for TREE_BLOCK_REF, the parent's new_address for
// SHARED_BLOCK_REF.
// NOTE(review): the signature line, the line that pulls the next ref off mr->refs and
// the actual insert statement are missing from this listing; the function name is not
// visible here either (presumably sort_metadata_reloc_refs) - confirm.
285  LIST_ENTRY newlist, *le;
286 
    // Head's forward and backward links are equal only for an empty or one-element list.
287  if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
288  return;
289 
290  // insertion sort
291 
292  InitializeListHead(&newlist);
293 
294  while (!IsListEmpty(&mr->refs)) {
296  bool inserted = false;
297 
298  if (ref->type == TYPE_TREE_BLOCK_REF)
299  ref->hash = ref->tbr.offset;
300  else if (ref->type == TYPE_SHARED_BLOCK_REF)
301  ref->hash = ref->parent->new_address;
302 
303  le = newlist.Flink;
304  while (le != &newlist) {
306 
            // Stop at the first entry that ref should precede: a larger type, or the
            // same type with a smaller hash.
307  if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
309  inserted = true;
310  break;
311  }
312 
313  le = le->Flink;
314  }
315 
316  if (!inserted)
317  InsertTailList(&newlist, &ref->list_entry);
318  }
319 
    // Re-home the sorted entries under mr->refs by patching the end links by hand.
320  newlist.Flink->Blink = &mr->refs;
321  newlist.Blink->Flink = &mr->refs;
322  mr->refs.Flink = newlist.Flink;
323  mr->refs.Blink = newlist.Blink;
324 }
325 
// add_metadata_reloc_extent_item (name taken from the caller's error message; the
// signature line is not part of this listing).
//
// Writes the extent-tree entries for a relocated tree block at mr->new_address:
//   1. sizes and fills an EXTENT_ITEM (followed by an EXTENT_ITEM2 when the skinny
//      metadata feature is off) with as many backreferences inline as fit;
//   2. inserts it as a METADATA_ITEM (skinny) or EXTENT_ITEM (non-skinny);
//   3. inserts the references that did not fit as standalone items;
//   4. for everything the block points at, swaps shared backreferences that name the
//      old address (mr->address) for ones naming the new address.
//
// Returns STATUS_SUCCESS or the status of the first failing call.
//
// NOTE(review): many statements are missing from this listing (e.g. the lines that
// obtain `ref`, `sbrrc`, `sdrrc`, `ed2`, `c`, `ce`, `cer`), so comments are limited to
// what is visible.
328  LIST_ENTRY* le;
329  uint64_t rc = 0;
330  uint16_t inline_len;
331  bool all_inline = true;
332  metadata_reloc_ref* first_noninline = NULL;
333  EXTENT_ITEM* ei;
334  uint8_t* ptr;
335 
336  inline_len = sizeof(EXTENT_ITEM);
337  if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
338  inline_len += sizeof(EXTENT_ITEM2);
339 
341 
    // Pass 1: count every reference and work out how many of them fit inline.
342  le = mr->refs.Flink;
343  while (le != &mr->refs) {
345  uint16_t extlen = 0;
346 
347  rc++;
348 
349  if (ref->type == TYPE_TREE_BLOCK_REF)
350  extlen += sizeof(TREE_BLOCK_REF);
351  else if (ref->type == TYPE_SHARED_BLOCK_REF)
352  extlen += sizeof(SHARED_BLOCK_REF);
353 
        // The item may grow up to a quarter of the node size; the first ref that would
        // exceed that, and every ref after it, is stored out of line. The "+ 1" is the type byte.
354  if (all_inline) {
355  if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
356  all_inline = false;
357  first_noninline = ref;
358  } else
359  inline_len += extlen + 1;
360  }
361 
362  le = le->Flink;
363  }
364 
365  ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
366  if (!ei) {
367  ERR("out of memory\n");
369  }
370 
    // Generation and flags are carried over from the original extent item.
371  ei->refcount = rc;
372  ei->generation = mr->ei->generation;
373  ei->flags = mr->ei->flags;
374  ptr = (uint8_t*)&ei[1];
375 
376  if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
377  EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
378 
        // The first key is read from directly behind the block's header.
379  ei2->firstitem = *(KEY*)&mr->data[1];
380  ei2->level = mr->data->level;
381 
382  ptr += sizeof(EXTENT_ITEM2);
383  }
384 
    // Pass 2: emit the inline references, stopping at the first one that did not fit.
385  le = mr->refs.Flink;
386  while (le != &mr->refs) {
388 
389  if (ref == first_noninline)
390  break;
391 
392  *ptr = ref->type;
393  ptr++;
394 
395  if (ref->type == TYPE_TREE_BLOCK_REF) {
397 
398  tbr->offset = ref->tbr.offset;
399 
400  ptr += sizeof(TREE_BLOCK_REF);
401  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
403 
            // Shared refs name the parent block, which has itself been given a new address.
404  sbr->offset = ref->parent->new_address;
405 
406  ptr += sizeof(SHARED_BLOCK_REF);
407  }
408 
409  le = le->Flink;
410  }
411 
    // Skinny items are keyed by tree level, full extent items by the node size.
412  if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
413  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
414  else
415  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
416 
417  if (!NT_SUCCESS(Status)) {
418  ERR("insert_tree_item returned %08lx\n", Status);
419  ExFreePool(ei);
420  return Status;
421  }
422 
    // Remaining references become data-less items keyed by the referenced value.
423  if (!all_inline) {
424  le = &first_noninline->list_entry;
425 
426  while (le != &mr->refs) {
428 
429  if (ref->type == TYPE_TREE_BLOCK_REF) {
430  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
431  if (!NT_SUCCESS(Status)) {
432  ERR("insert_tree_item returned %08lx\n", Status);
433  return Status;
434  }
435  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
436  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
437  if (!NT_SUCCESS(Status)) {
438  ERR("insert_tree_item returned %08lx\n", Status);
439  return Status;
440  }
441  }
442 
443  le = le->Flink;
444  }
445  }
446 
    // NOTE(review): the opening condition of the enclosing block (listing line 447) is
    // missing; the code below re-points shared backreferences held by this block.
448  if (mr->data->level > 0) {
449  uint16_t i;
450  internal_node* in = (internal_node*)&mr->data[1];
451 
        // Internal node: visit every child pointer.
452  for (i = 0; i < mr->data->num_items; i++) {
454 
            // sbrrc is computed on a line missing from the listing.
455  if (sbrrc > 0) {
456  SHARED_BLOCK_REF sbr;
457 
                // Add the reference from the new address before dropping the old one.
458  sbr.offset = mr->new_address;
459 
460  Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
461  if (!NT_SUCCESS(Status)) {
462  ERR("increase_extent_refcount returned %08lx\n", Status);
463  return Status;
464  }
465 
466  sbr.offset = mr->address;
467 
468  Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
469  sbr.offset, false, NULL);
470  if (!NT_SUCCESS(Status)) {
471  ERR("decrease_extent_refcount returned %08lx\n", Status);
472  return Status;
473  }
474  }
475  }
476  } else {
477  uint16_t i;
478  leaf_node* ln = (leaf_node*)&mr->data[1];
479 
        // Leaf: only EXTENT_DATA items large enough to contain an EXTENT_DATA2 are of interest.
480  for (i = 0; i < mr->data->num_items; i++) {
481  if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
482  EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
483 
486 
487  if (ed2->size > 0) { // not sparse
489 
                    // sdrrc is computed on a line missing from the listing.
490  if (sdrrc > 0) {
491  SHARED_DATA_REF sdr;
492  chunk* c;
493 
494  sdr.offset = mr->new_address;
495  sdr.count = sdrrc;
496 
498  if (!NT_SUCCESS(Status)) {
499  ERR("increase_extent_refcount returned %08lx\n", Status);
500  return Status;
501  }
502 
503  sdr.offset = mr->address;
504 
506  sdr.offset, false, NULL);
507  if (!NT_SUCCESS(Status)) {
508  ERR("decrease_extent_refcount returned %08lx\n", Status);
509  return Status;
510  }
511 
513 
                        // Pending (not yet flushed) reference changes that still name
                        // the old block address are re-pointed as well.
514  if (c) {
515  // check changed_extents
516 
517  ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
518 
519  le = c->changed_extents.Flink;
520 
521  while (le != &c->changed_extents) {
523 
524  if (ce->address == ed2->address) {
525  LIST_ENTRY* le2;
526 
                                // Only the first matching entry in each list is updated.
527  le2 = ce->refs.Flink;
528  while (le2 != &ce->refs) {
530 
531  if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
532  cer->sdr.offset = mr->new_address;
533  break;
534  }
535 
536  le2 = le2->Flink;
537  }
538 
539  le2 = ce->old_refs.Flink;
540  while (le2 != &ce->old_refs) {
542 
543  if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
544  cer->sdr.offset = mr->new_address;
545  break;
546  }
547 
548  le2 = le2->Flink;
549  }
550 
551  break;
552  }
553 
554  le = le->Flink;
555  }
556 
557  ExReleaseResourceLite(&c->changed_extents_lock);
558  }
559  }
560  }
561  }
562  }
563  }
564  }
565  }
566 
567  return STATUS_SUCCESS;
568 }
569 
// write_metadata_items (name taken from the caller's error message; the first line of
// the signature is not part of this listing).
//
// Relocates every tree block recorded in `items`. Visible phases:
//   1. For each record: read the block into mr->data, flag it as system if its chunk
//      is a SYSTEM chunk, rewrite data extent addresses in leaves according to
//      data_items (when given), and resolve the parent of every backreference - which
//      may append further records to `items`.
//   2. For each record: look up an already loaded tree with the same address via the
//      tree hash table and remember it in mr->t.
//   3. Level by level, from 0 to the highest level seen: pick a new address, patch
//      parents / root items / loaded trees to the new address, checksum the block and
//      queue it for writing (queue kept sorted by address).
//   4. Write the queued blocks, then create the new extent items.
// The tree_writes queue is released at `end`.
//
// Returns STATUS_SUCCESS or the status of the first failing step.
//
// NOTE(review): many statements are missing from this listing (declarations of
// Status/tp, the lines that obtain mr, ref, dr, ed2, r2, c2, td, tw, tw2, and several
// Status assignments before `goto end`); comments cover only what is visible.
571  LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
572  LIST_ENTRY tree_writes, *le;
575  uint8_t level, max_level = 0;
576  chunk* newchunk = NULL;
577 
578  InitializeListHead(&tree_writes);
579 
    // Phase 1: load each block and resolve where its references come from.
580  le = items->Flink;
581  while (le != items) {
583  LIST_ENTRY* le2;
584  chunk* pc;
585 
586  mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
587  if (!mr->data) {
588  ERR("out of memory\n");
590  }
591 
        // The chunk c is passed along only when the block actually lies inside it.
592  Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
593  c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
594  if (!NT_SUCCESS(Status)) {
595  ERR("read_data returned %08lx\n", Status);
596  return Status;
597  }
598 
599  if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
600  mr->system = true;
601 
        // Leaves: point EXTENT_DATA items at the new location of any relocated data extent.
602  if (data_items && mr->data->level == 0) {
603  le2 = data_items->Flink;
604  while (le2 != data_items) {
606  leaf_node* ln = (leaf_node*)&mr->data[1];
607  uint16_t i;
608 
609  for (i = 0; i < mr->data->num_items; i++) {
610  if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
611  EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
612 
615 
616  if (ed2->address == dr->address)
617  ed2->address = dr->new_address;
618  }
619  }
620  }
621 
622  le2 = le2->Flink;
623  }
624  }
625 
626  if (mr->data->level > max_level)
627  max_level = mr->data->level;
628 
629  le2 = mr->refs.Flink;
630  while (le2 != &mr->refs) {
632 
633  if (ref->type == TYPE_TREE_BLOCK_REF) {
634  KEY* firstitem;
635  root* r = NULL;
636  LIST_ENTRY* le3;
637  tree* t;
638 
639  firstitem = (KEY*)&mr->data[1];
640 
                // Find the root whose id the reference names.
641  le3 = Vcb->roots.Flink;
642  while (le3 != &Vcb->roots) {
644 
645  if (r2->id == ref->tbr.offset) {
646  r = r2;
647  break;
648  }
649 
650  le3 = le3->Flink;
651  }
652 
653  if (!r) {
654  ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
655  return STATUS_INTERNAL_ERROR;
656  }
657 
                // Descend towards the block's first key, stopping one level above the block.
658  Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
660  ERR("find_item_to_level returned %08lx\n", Status);
661  return Status;
662  }
663 
                // Climb until a tree at (or above) the parent's level is reached.
664  t = tp.tree;
665  while (t && t->header.level < mr->data->level + 1) {
666  t = t->parent;
667  }
668 
                // No parent found: this block is treated as the top of its tree.
669  if (!t)
670  ref->top = true;
671  else {
672  metadata_reloc* mr2;
673 
674  Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
675  if (!NT_SUCCESS(Status)) {
676  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
677  return Status;
678  }
679 
680  ref->parent = mr2;
681  }
682  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
683  metadata_reloc* mr2;
684 
                // A shared ref names its parent block directly.
685  Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
686  if (!NT_SUCCESS(Status)) {
687  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
688  return Status;
689  }
690 
691  ref->parent = mr2;
692  }
693 
694  le2 = le2->Flink;
695  }
696 
697  le = le->Flink;
698  }
699 
    // Phase 2: find the loaded tree (if any) for each block. trees_ptrs is indexed by
    // the top byte of the crc32c of the address; the scan stops once hashes grow past
    // the one searched for.
700  le = items->Flink;
701  while (le != items) {
703  LIST_ENTRY* le2;
704  uint32_t hash;
705 
706  mr->t = NULL;
707 
708  hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
709 
710  le2 = Vcb->trees_ptrs[hash >> 24];
711 
712  if (le2) {
713  while (le2 != &Vcb->trees_hash) {
714  tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
715 
716  if (t->header.address == mr->address) {
717  mr->t = t;
718  break;
719  } else if (t->hash > hash)
720  break;
721 
722  le2 = le2->Flink;
723  }
724  }
725 
726  le = le->Flink;
727  }
728 
    // Phase 3: relocate one tree level at a time, lowest first.
729  for (level = 0; level <= max_level; level++) {
730  le = items->Flink;
731  while (le != items) {
733 
734  if (mr->data->level == level) {
735  bool done = false;
736  LIST_ENTRY* le2;
737  tree_write* tw;
738  uint64_t flags;
739  tree* t3;
740 
                // Choose the chunk type the block must land in.
741  if (mr->system)
742  flags = Vcb->system_flags;
743  else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
744  flags = Vcb->data_flags;
745  else
746  flags = Vcb->metadata_flags;
747 
                // Try the chunk used last time first.
748  if (newchunk) {
749  acquire_chunk_lock(newchunk, Vcb);
750 
751  if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
752  newchunk->used += Vcb->superblock.node_size;
753  space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
754  done = true;
755  }
756 
757  release_chunk_lock(newchunk, Vcb);
758  }
759 
                // Otherwise search the other writable chunks that are not being relocated.
760  if (!done) {
761  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
762 
763  le2 = Vcb->chunks.Flink;
764  while (le2 != &Vcb->chunks) {
766 
767  if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
768  acquire_chunk_lock(c2, Vcb);
769 
770  if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
772  c2->used += Vcb->superblock.node_size;
773  space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback);
774  release_chunk_lock(c2, Vcb);
775  newchunk = c2;
776  done = true;
777  break;
778  }
779  }
780 
781  release_chunk_lock(c2, Vcb);
782  }
783 
784  le2 = le2->Flink;
785  }
786 
787  // allocate new chunk if necessary
788  if (!done) {
789  Status = alloc_chunk(Vcb, flags, &newchunk, false);
790 
791  if (!NT_SUCCESS(Status)) {
792  ERR("alloc_chunk returned %08lx\n", Status);
793  ExReleaseResourceLite(&Vcb->chunk_lock);
794  goto end;
795  }
796 
797  acquire_chunk_lock(newchunk, Vcb);
798 
799  newchunk->balance_num = Vcb->balance.balance_num;
800 
801  if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
802  release_chunk_lock(newchunk, Vcb);
803  ExReleaseResourceLite(&Vcb->chunk_lock);
804  ERR("could not find address in new chunk\n");
806  goto end;
807  } else {
808  newchunk->used += Vcb->superblock.node_size;
809  space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
810  }
811 
812  release_chunk_lock(newchunk, Vcb);
813  }
814 
815  ExReleaseResourceLite(&Vcb->chunk_lock);
816  }
817 
818  // update parents
819  le2 = mr->refs.Flink;
820  while (le2 != &mr->refs) {
822 
823  if (ref->parent) {
824  uint16_t i;
825  internal_node* in = (internal_node*)&ref->parent->data[1];
826 
                        // Patch the child pointer in the parent's in-memory copy of the block...
827  for (i = 0; i < ref->parent->data->num_items; i++) {
828  if (in[i].address == mr->address) {
829  in[i].address = mr->new_address;
830  break;
831  }
832  }
833 
                        // ...and in the parent's loaded tree, if there is one.
834  if (ref->parent->t) {
835  LIST_ENTRY* le3;
836 
837  le3 = ref->parent->t->itemlist.Flink;
838  while (le3 != &ref->parent->t->itemlist) {
840 
841  if (!td->inserted && td->treeholder.address == mr->address)
842  td->treeholder.address = mr->new_address;
843 
844  le3 = le3->Flink;
845  }
846  }
847  } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
848  LIST_ENTRY* le3;
849  root* r = NULL;
850 
851  // alter ROOT_ITEM
852 
853  le3 = Vcb->roots.Flink;
854  while (le3 != &Vcb->roots) {
856 
857  if (r2->id == ref->tbr.offset) {
858  r = r2;
859  break;
860  }
861 
862  le3 = le3->Flink;
863  }
864 
865  if (r) {
866  r->treeholder.address = mr->new_address;
867 
                            // The root tree and chunk tree are located from the superblock;
                            // other roots get their ROOT_ITEM replaced in the root tree.
868  if (r == Vcb->root_root)
869  Vcb->superblock.root_tree_addr = mr->new_address;
870  else if (r == Vcb->chunk_root)
871  Vcb->superblock.chunk_tree_addr = mr->new_address;
872  else if (r->root_item.block_number == mr->address) {
873  KEY searchkey;
874  ROOT_ITEM* ri;
875 
876  r->root_item.block_number = mr->new_address;
877 
878  searchkey.obj_id = r->id;
879  searchkey.obj_type = TYPE_ROOT_ITEM;
880  searchkey.offset = 0xffffffffffffffff;
881 
882  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
883  if (!NT_SUCCESS(Status)) {
884  ERR("find_item returned %08lx\n", Status);
885  goto end;
886  }
887 
888  if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
889  ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
891  goto end;
892  }
893 
895  if (!ri) {
896  ERR("out of memory\n");
898  goto end;
899  }
900 
901  RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
902 
904  if (!NT_SUCCESS(Status)) {
905  ERR("delete_tree_item returned %08lx\n", Status);
906  goto end;
907  }
908 
909  Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
910  if (!NT_SUCCESS(Status)) {
911  ERR("insert_tree_item returned %08lx\n", Status);
912  goto end;
913  }
914  }
915  }
916  }
917 
918  le2 = le2->Flink;
919  }
920 
921  mr->data->address = mr->new_address;
922 
923  t3 = mr->t;
924 
                // Move every loaded tree for this block to its new position in the hash
                // list, since the hash is derived from the address that just changed.
925  while (t3) {
926  uint8_t h;
927  bool inserted;
928  tree* t4 = NULL;
929 
930  // check if tree loaded more than once
                    // NOTE(review): this tests list_entry.Flink while the following line
                    // walks list_entry_hash.Flink - confirm which member is intended.
931  if (t3->list_entry.Flink != &Vcb->trees_hash) {
932  tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
933 
934  if (nt->header.address == t3->header.address)
935  t4 = nt;
936  }
937 
938  t3->header.address = mr->new_address;
939 
940  h = t3->hash >> 24;
941 
                    // If t3 was the first entry of its bucket, pass that role to its
                    // successor when the successor shares the bucket, else empty the bucket.
942  if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
943  if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
944  Vcb->trees_ptrs[h] = NULL;
945  else {
946  tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
947 
948  if (t2->hash >> 24 == h)
949  Vcb->trees_ptrs[h] = &t2->list_entry_hash;
950  else
951  Vcb->trees_ptrs[h] = NULL;
952  }
953  }
954 
956 
957  t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
958  h = t3->hash >> 24;
959 
                    // Pick a starting point for the ordered insert: the new bucket's head,
                    // or failing that the head of the nearest non-empty lower bucket.
                    // NOTE(review): h2 is a uint8_t and the loop condition is h2 > 0, so
                    // bucket 0 is never examined when h >= 2, and when h == 1 the
                    // decrement after examining bucket 0 wraps to 255 and the scan
                    // continues from the top - confirm this is intended.
960  if (!Vcb->trees_ptrs[h]) {
961  uint8_t h2 = h;
962 
963  le2 = Vcb->trees_hash.Flink;
964 
965  if (h2 > 0) {
966  h2--;
967  do {
968  if (Vcb->trees_ptrs[h2]) {
969  le2 = Vcb->trees_ptrs[h2];
970  break;
971  }
972 
973  h2--;
974  } while (h2 > 0);
975  }
976  } else
977  le2 = Vcb->trees_ptrs[h];
978 
979  inserted = false;
980  while (le2 != &Vcb->trees_hash) {
981  tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
982 
983  if (t2->hash >= t3->hash) {
985  inserted = true;
986  break;
987  }
988 
989  le2 = le2->Flink;
990  }
991 
992  if (!inserted)
993  InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
994 
                    // t3 becomes the bucket head if the bucket was empty or t3 now sits
                    // directly in front of the old head.
995  if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
996  Vcb->trees_ptrs[h] = &t3->list_entry_hash;
997 
                    // Loaded leaves also need their cached EXTENT_DATA items re-pointed.
998  if (data_items && level == 0) {
999  le2 = data_items->Flink;
1000 
1001  while (le2 != data_items) {
1003  LIST_ENTRY* le3 = t3->itemlist.Flink;
1004 
1005  while (le3 != &t3->itemlist) {
1007 
1008  if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1009  EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1010 
1013 
1014  if (ed2->address == dr->address)
1015  ed2->address = dr->new_address;
1016  }
1017  }
1018 
1019  le3 = le3->Flink;
1020  }
1021 
1022  le2 = le2->Flink;
1023  }
1024  }
1025 
                    // Continue with the duplicate loaded tree found above, if any.
1026  t3 = t4;
1027  }
1028 
1029  calc_tree_checksum(Vcb, mr->data);
1030 
1032  if (!tw) {
1033  ERR("out of memory\n");
1035  goto end;
1036  }
1037 
                // The write borrows mr->data (allocated = false), so cleanup at `end`
                // does not free the buffer through tw.
1038  tw->address = mr->new_address;
1039  tw->length = Vcb->superblock.node_size;
1040  tw->data = (uint8_t*)mr->data;
1041  tw->allocated = false;
1042 
                // Keep the queue ordered by ascending address.
1043  if (IsListEmpty(&tree_writes))
1044  InsertTailList(&tree_writes, &tw->list_entry);
1045  else {
1046  bool inserted = false;
1047 
1048  le2 = tree_writes.Flink;
1049  while (le2 != &tree_writes) {
1051 
1052  if (tw2->address > tw->address) {
1053  InsertHeadList(le2->Blink, &tw->list_entry);
1054  inserted = true;
1055  break;
1056  }
1057 
1058  le2 = le2->Flink;
1059  }
1060 
1061  if (!inserted)
1062  InsertTailList(&tree_writes, &tw->list_entry);
1063  }
1064  }
1065 
1066  le = le->Flink;
1067  }
1068  }
1069 
    // Phase 4: write the relocated blocks, then record them in the extent tree.
1070  Status = do_tree_writes(Vcb, &tree_writes, true);
1071  if (!NT_SUCCESS(Status)) {
1072  ERR("do_tree_writes returned %08lx\n", Status);
1073  goto end;
1074  }
1075 
1076  le = items->Flink;
1077  while (le != items) {
1079 
1081  if (!NT_SUCCESS(Status)) {
1082  ERR("add_metadata_reloc_extent_item returned %08lx\n", Status);
1083  goto end;
1084  }
1085 
1086  le = le->Flink;
1087  }
1088 
1090 
    // Common exit: release the write queue entries (and their buffers where owned).
1091 end:
1092  while (!IsListEmpty(&tree_writes)) {
1094 
1095  if (tw->allocated)
1096  ExFreePool(tw->data);
1097 
1098  ExFreePool(tw);
1099  }
1100 
1101  return Status;
1102 }
1103 
// Relocates a batch of tree blocks out of chunk c. With the tree lock held
// exclusively, it scans the extent tree from c->offset to the end of the chunk,
// collects up to 64 tree-block extents with add_metadata_reloc, relocates them, and
// on success flushes with do_write. *changed reports whether any tree block was found.
// All collected records, their refs and their data buffers are freed before returning.
//
// Returns the status of the last step executed.
//
// NOTE(review): the signature line and several statements (list initialisation, the
// relocation call, the success/failure actions after do_write, the lines that obtain
// mr and ref during cleanup) are missing from this listing; the function name is not
// visible here either (presumably balance_metadata_chunk) - confirm.
1105  KEY searchkey;
1106  traverse_ptr tp;
1107  NTSTATUS Status;
1108  bool b;
1110  uint32_t loaded = 0;
1111 
1112  TRACE("chunk %I64x\n", c->offset);
1113 
1116 
1117  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1118 
1119  searchkey.obj_id = c->offset;
1120  searchkey.obj_type = TYPE_METADATA_ITEM;
1121  searchkey.offset = 0xffffffffffffffff;
1122 
1123  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1124  if (!NT_SUCCESS(Status)) {
1125  ERR("find_item returned %08lx\n", Status);
1126  goto end;
1127  }
1128 
1129  do {
1130  traverse_ptr next_tp;
1131 
        // Past the end of the chunk: nothing more to collect.
1132  if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1133  break;
1134 
1135  if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1136  bool tree = false, skinny = false;
1137 
            // A METADATA_ITEM always describes a tree block; an EXTENT_ITEM of node size
            // is examined further (the deciding condition is on a missing line).
1138  if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1139  tree = true;
1140  skinny = true;
1141  } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1142  tp.item->size >= sizeof(EXTENT_ITEM)) {
1143  EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1144 
1146  tree = true;
1147  }
1148 
1149  if (tree) {
1150  Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1151 
1152  if (!NT_SUCCESS(Status)) {
1153  ERR("add_metadata_reloc returned %08lx\n", Status);
1154  goto end;
1155  }
1156 
1157  loaded++;
1158 
1159  if (loaded >= 64) // only do 64 at a time
1160  break;
1161  }
1162  }
1163 
1164  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1165 
1166  if (b)
1167  tp = next_tp;
1168  } while (b);
1169 
    // Nothing collected means the chunk holds no more tree blocks.
1170  if (IsListEmpty(&items)) {
1171  *changed = false;
1173  goto end;
1174  } else
1175  *changed = true;
1176 
    // The call checked here is missing from the listing; per the message it is write_metadata_items.
1178  if (!NT_SUCCESS(Status)) {
1179  ERR("write_metadata_items returned %08lx\n", Status);
1180  goto end;
1181  }
1182 
1184 
1185  Vcb->need_write = true;
1186 
1187 end:
    // Flush only if everything so far succeeded.
1188  if (NT_SUCCESS(Status)) {
1189  Status = do_write(Vcb, NULL);
1190  if (!NT_SUCCESS(Status))
1191  ERR("do_write returned %08lx\n", Status);
1192  }
1193 
    // NOTE(review): both branch bodies are missing from the listing - presumably the
    // rollback list is discarded on success and applied on failure; confirm.
1194  if (NT_SUCCESS(Status))
1196  else
1198 
1199  free_trees(Vcb);
1200 
1201  ExReleaseResourceLite(&Vcb->tree_lock);
1202 
    // Release every record together with its references and block buffer.
1203  while (!IsListEmpty(&items)) {
1205 
1206  while (!IsListEmpty(&mr->refs)) {
1208 
1209  ExFreePool(ref);
1210  }
1211 
1212  if (mr->data)
1213  ExFreePool(mr->data);
1214 
1215  ExFreePool(mr);
1216  }
1217 
1218  return Status;
1219 }
1220 
// data_reloc_add_tree_edr (name taken from its callers; the signature line is not
// part of this listing).
//
// For one EXTENT_DATA_REF (edr) on the data extent described by dr: finds the root
// whose id is edr->root, walks the EXTENT_DATA items of inode edr->objid in it, and
// for every item that refers to this extent (same address and size, and file offset
// minus extent offset equal to edr->offset) records which leaf holds it. Consecutive
// matches in the same leaf share one reference whose count is bumped; a match in a
// different leaf creates a new reference, whose parent is that leaf's metadata_reloc
// (obtained through add_metadata_reloc_parent) and which is appended to dr->refs.
//
// Returns STATUS_SUCCESS, STATUS_INTERNAL_ERROR when the root or the inode's
// EXTENT_DATA items cannot be found, or the status of a failed helper.
//
// NOTE(review): the declaration of `ref`, the lines that obtain r2/ed/ed2, the
// allocation of ref and the out-of-memory return are missing from this listing.
1223  NTSTATUS Status;
1224  LIST_ENTRY* le;
1225  KEY searchkey;
1226  traverse_ptr tp;
1227  root* r = NULL;
1228  metadata_reloc* mr;
1229  uint64_t last_tree = 0;
1231 
    // Locate the subvolume root named by the reference.
1232  le = Vcb->roots.Flink;
1233  while (le != &Vcb->roots) {
1235 
1236  if (r2->id == edr->root) {
1237  r = r2;
1238  break;
1239  }
1240 
1241  le = le->Flink;
1242  }
1243 
1244  if (!r) {
1245  ERR("could not find subvol %I64x\n", edr->root);
1246  return STATUS_INTERNAL_ERROR;
1247  }
1248 
1249  searchkey.obj_id = edr->objid;
1250  searchkey.obj_type = TYPE_EXTENT_DATA;
1251  searchkey.offset = 0;
1252 
1253  Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1254  if (!NT_SUCCESS(Status)) {
1255  ERR("find_item returned %08lx\n", Status);
1256  return Status;
1257  }
1258 
    // The lookup may land on the item just before the first EXTENT_DATA; step forward once.
1259  if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1260  traverse_ptr tp2;
1261 
1262  if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1263  tp = tp2;
1264  else {
1265  ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1266  return STATUS_INTERNAL_ERROR;
1267  }
1268  }
1269 
1270  ref = NULL;
1271 
    // Visit every EXTENT_DATA item of this inode.
1272  while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1273  traverse_ptr tp2;
1274 
1275  if (tp.item->size >= sizeof(EXTENT_DATA)) {
1277 
1280 
1281  if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
                // Same leaf as the previous match: one reference covers both.
1282  if (ref && last_tree == tp.tree->header.address)
1283  ref->edr.count++;
1284  else {
1286  if (!ref) {
1287  ERR("out of memory\n");
1289  }
1290 
1291  ref->type = TYPE_EXTENT_DATA_REF;
1292  RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1293  ref->edr.count = 1;
1294 
1295  Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1296  if (!NT_SUCCESS(Status)) {
1297  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1298  ExFreePool(ref);
1299  return Status;
1300  }
1301 
1302  last_tree = tp.tree->header.address;
1303  ref->parent = mr;
1304 
1305  InsertTailList(&dr->refs, &ref->list_entry);
1306  }
1307  }
1308  }
1309  }
1310 
1311  if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1312  tp = tp2;
1313  else
1314  break;
1315  }
1316 
1317  return STATUS_SUCCESS;
1318 }
1319 
// NOTE(review): the signature line of this function is missing from this listing.
// Judging by the "add_data_reloc returned" message logged by its caller further
// down, this is presumably add_data_reloc(Vcb, items, metadata_items, tp, c, rollback).
// From the visible lines: it fills in a data_reloc (dr) for the extent item that
// tp points at, and appends one entry to dr->refs for every back-reference found,
// first among the inline refs that follow the EXTENT_ITEM and then, if those do
// not account for the whole refcount, among the following tree items.
1322  NTSTATUS Status;
1323  data_reloc* dr;
1324  EXTENT_ITEM* ei;
1325  uint16_t len;
1326  uint64_t inline_rc;
1327  uint8_t* ptr;
1328 
// NOTE(review): the statement that assigns dr (and the return inside this branch)
// are not visible in this listing.
1330  if (!dr) {
1331  ERR("out of memory\n");
1333  }
1334 
// Key of an extent item: obj_id is used as the extent address, offset as its length.
1335  dr->address = tp->item->key.obj_id;
1336  dr->size = tp->item->key.offset;
1337  dr->ei = (EXTENT_ITEM*)tp->item->data;
1338  InitializeListHead(&dr->refs);
1339 
// NOTE(review): none of the visible error returns in this function release dr or
// the entries already linked on dr->refs. If dr is heap-allocated above (as the
// "out of memory" check suggests) and is only queued at the very end, these paths
// leak it - confirm against the full source.
1341  if (!NT_SUCCESS(Status)) {
1342  ERR("delete_tree_item returned %08lx\n", Status);
1343  return Status;
1344  }
1345 
1346  if (!c)
1349  if (c) {
1351 
// The extent no longer counts against the chunk it is being moved out of.
1352  c->used -= tp->item->key.offset;
1353 
1355 
1357  }
1358 
1359  ei = (EXTENT_ITEM*)tp->item->data;
1360  inline_rc = 0;
1361 
// NOTE(review): no check is visible here that tp->item->size >= sizeof(EXTENT_ITEM);
// if it were smaller, this uint16_t subtraction would wrap - confirm the caller
// guarantees the minimum size.
1362  len = tp->item->size - sizeof(EXTENT_ITEM);
1363  ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1364 
// Walk the inline refs: each is a one-byte type tag followed by a fixed-size payload.
1365  while (len > 0) {
1366  uint8_t secttype = *ptr;
1367  uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1368 
// account for the type byte before comparing against the payload length
1369  len--;
1370 
1371  if (sectlen > len) {
1372  ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1373  return STATUS_INTERNAL_ERROR;
1374  }
1375 
1376  if (sectlen == 0) {
1377  ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1378  return STATUS_INTERNAL_ERROR;
1379  }
1380 
1381  if (secttype == TYPE_EXTENT_DATA_REF) {
1382  EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1383 
1384  inline_rc += edr->count;
1385 
1386  Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1387  if (!NT_SUCCESS(Status)) {
1388  ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1389  return Status;
1390  }
1391  } else if (secttype == TYPE_SHARED_DATA_REF) {
1392  metadata_reloc* mr;
1394 
1396  if (!ref) {
1397  ERR("out of memory\n");
1399  }
1400 
1401  ref->type = TYPE_SHARED_DATA_REF;
1402  RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1403  inline_rc += ref->sdr.count;
1404 
// sdr.offset is passed as the address of the parent metadata item to relocate
1405  Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1406  if (!NT_SUCCESS(Status)) {
1407  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1408  ExFreePool(ref);
1409  return Status;
1410  }
1411 
1412  ref->parent = mr;
1413 
1414  InsertTailList(&dr->refs, &ref->list_entry);
1415  } else {
// Unreachable as written: any other secttype yields sectlen == 0 and has
// already returned above.
1416  ERR("unexpected tree type %x\n", secttype);
1417  return STATUS_INTERNAL_ERROR;
1418  }
1419 
1420 
1421  len -= sectlen;
1422  ptr += sizeof(uint8_t) + sectlen;
1423  }
1424 
1425  if (inline_rc < ei->refcount) { // look for non-inline entries
1426  traverse_ptr tp2 = *tp, next_tp;
1427 
// Scan forward through the items that share this extent's obj_id; stop at the
// first item with a different obj_id.
1428  while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1429  tp2 = next_tp;
1430 
1431  if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1432  if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1433  Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1434  if (!NT_SUCCESS(Status)) {
1435  ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1436  return Status;
1437  }
1438 
1439  Status = delete_tree_item(Vcb, &tp2);
1440  if (!NT_SUCCESS(Status)) {
1441  ERR("delete_tree_item returned %08lx\n", Status);
1442  return Status;
1443  }
1444  } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1445  metadata_reloc* mr;
1447 
1449  if (!ref) {
1450  ERR("out of memory\n");
1452  }
1453 
// For a standalone shared ref the parent address is read from the key offset
// and the item body is read as a 32-bit count.
1454  ref->type = TYPE_SHARED_DATA_REF;
1455  ref->sdr.offset = tp2.item->key.offset;
1456  ref->sdr.count = *((uint32_t*)tp2.item->data);
1457 
1458  Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1459  if (!NT_SUCCESS(Status)) {
1460  ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1461  ExFreePool(ref);
1462  return Status;
1463  }
1464 
1465  ref->parent = mr;
1466  InsertTailList(&dr->refs, &ref->list_entry);
1467 
1468  Status = delete_tree_item(Vcb, &tp2);
1469  if (!NT_SUCCESS(Status)) {
1470  ERR("delete_tree_item returned %08lx\n", Status);
1471  return Status;
1472  }
1473  }
1474  } else
1475  break;
1476  }
1477  }
1478 
1480 
1481  return STATUS_SUCCESS;
1482 }
1483 
// NOTE(review): the signature line of this function is missing from this listing;
// the body operates on a data_reloc called dr.
// From the visible lines: it computes ref->hash for each entry of dr->refs,
// rebuilds the list in sorted order via a temporary list, merges adjacent
// EXTENT_DATA_REF entries that describe the same (root, objid, offset), and then
// makes dr->refs point at the rebuilt list.
1485  LIST_ENTRY newlist, *le;
1486 
1487  if (IsListEmpty(&dr->refs))
1488  return;
1489 
1490  // insertion sort
1491 
1492  InitializeListHead(&newlist);
1493 
1494  while (!IsListEmpty(&dr->refs)) {
// NOTE(review): the line that takes `ref` off dr->refs is not visible here.
1496  bool inserted = false;
1497 
// Sort key: the extent-data-ref hash for data refs, the parent's new address
// for shared refs.
1498  if (ref->type == TYPE_EXTENT_DATA_REF)
1499  ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1500  else if (ref->type == TYPE_SHARED_DATA_REF)
1501  ref->hash = ref->parent->new_address;
1502 
1503  le = newlist.Flink;
1504  while (le != &newlist) {
1506 
// Position found when ref has a lower type than ref2, or the same type and a
// higher hash. NOTE(review): the insertion statement itself is missing from
// this listing; presumably ref is linked in front of ref2.
1507  if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1509  inserted = true;
1510  break;
1511  }
1512 
1513  le = le->Flink;
1514  }
1515 
1516  if (!inserted)
1517  InsertTailList(&newlist, &ref->list_entry);
1518  }
1519 
// Second pass: fold duplicate data refs into one entry, summing their counts.
1520  le = newlist.Flink;
1521  while (le != &newlist) {
1523 
1524  if (le->Flink != &newlist) {
1526 
1527  if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1528  ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1529  RemoveEntryList(&ref2->list_entry);
1530  ref->edr.count += ref2->edr.count;
1531  ExFreePool(ref2);
// stay on the same entry: its new successor may be a duplicate as well
1532  continue;
1533  }
1534  }
1535 
1536  le = le->Flink;
1537  }
1538 
// Splice: re-point the first/last entries and the dr->refs head at each other so
// that dr->refs now heads the sorted list (newlist is non-empty here because
// dr->refs was non-empty on entry).
1539  newlist.Flink->Blink = &dr->refs;
1540  newlist.Blink->Flink = &dr->refs;
1541  dr->refs.Flink = newlist.Flink;
1542  dr->refs.Blink = newlist.Blink;
1543 }
1544 
// NOTE(review): the signature line of this function is missing from this listing.
// Judging by the "add_data_reloc_extent_item returned" message logged by its
// caller, this is presumably add_data_reloc_extent_item(Vcb, dr).
// From the visible lines: it builds a new EXTENT_ITEM for dr at dr->new_address,
// packs as many of dr->refs inline as fit, inserts it into the extent root, and
// inserts the remaining refs as separate EXTENT_DATA_REF / SHARED_DATA_REF items.
1546  NTSTATUS Status;
1547  LIST_ENTRY* le;
1548  uint64_t rc = 0;
1549  uint16_t inline_len;
1550  bool all_inline = true;
1551  data_reloc_ref* first_noninline = NULL;
1552  EXTENT_ITEM* ei;
1553  uint8_t* ptr;
1554 
1555  inline_len = sizeof(EXTENT_ITEM);
1556 
1558 
// Pass 1: total the reference count and decide where the inline refs stop.
1559  le = dr->refs.Flink;
1560  while (le != &dr->refs) {
1562  uint16_t extlen = 0;
1563 
1564  if (ref->type == TYPE_EXTENT_DATA_REF) {
1565  extlen += sizeof(EXTENT_DATA_REF);
1566  rc += ref->edr.count;
1567  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1568  extlen += sizeof(SHARED_DATA_REF);
// NOTE(review): a shared ref adds 1 here, while the item written below stores
// ref->sdr.count; if that count can exceed 1 the refcount would be understated -
// confirm.
1569  rc++;
1570  }
1571 
1572  if (all_inline) {
// Inline data (including the one-byte type tag per ref) is capped at a quarter
// of the node size; the first ref that would exceed the cap, and every ref after
// it, is written as a separate item.
1573  if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1574  all_inline = false;
1575  first_noninline = ref;
1576  } else
1577  inline_len += extlen + 1;
1578  }
1579 
1580  le = le->Flink;
1581  }
1582 
1583  ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1584  if (!ei) {
1585  ERR("out of memory\n");
1587  }
1588 
// generation and flags are carried over from the original extent item
1589  ei->refcount = rc;
1590  ei->generation = dr->ei->generation;
1591  ei->flags = dr->ei->flags;
1592  ptr = (uint8_t*)&ei[1];
1593 
// Pass 2: serialise the inline refs directly after the EXTENT_ITEM header.
1594  le = dr->refs.Flink;
1595  while (le != &dr->refs) {
1597 
1598  if (ref == first_noninline)
1599  break;
1600 
1601  *ptr = ref->type;
1602  ptr++;
1603 
1604  if (ref->type == TYPE_EXTENT_DATA_REF) {
1606 
1607  RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1608 
1609  ptr += sizeof(EXTENT_DATA_REF);
1610  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1612 
// shared refs are rewritten to point at the parent's relocated address
1613  sdr->offset = ref->parent->new_address;
1614  sdr->count = ref->sdr.count;
1615 
1616  ptr += sizeof(SHARED_DATA_REF);
1617  }
1618 
1619  le = le->Flink;
1620  }
1621 
1622  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1623  if (!NT_SUCCESS(Status)) {
// NOTE(review): ei is not freed on this path, whereas another caller in this
// file frees its buffer (bi) when insert_tree_item fails; this looks like a
// leak - confirm. The same applies to edr and sdr in the loop below.
1624  ERR("insert_tree_item returned %08lx\n", Status);
1625  return Status;
1626  }
1627 
1628  if (!all_inline) {
1629  le = &first_noninline->list_entry;
1630 
// Pass 3: everything from first_noninline onwards becomes its own tree item.
1631  while (le != &dr->refs) {
1633 
1634  if (ref->type == TYPE_EXTENT_DATA_REF) {
1635  EXTENT_DATA_REF* edr;
1636 
1638  if (!edr) {
1639  ERR("out of memory\n");
1641  }
1642 
1643  RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1644 
// keyed by ref->hash
1645  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1646  if (!NT_SUCCESS(Status)) {
1647  ERR("insert_tree_item returned %08lx\n", Status);
1648  return Status;
1649  }
1650  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1651  uint32_t* sdr;
1652 
1654  if (!sdr) {
1655  ERR("out of memory\n");
1657  }
1658 
1659  *sdr = ref->sdr.count;
1660 
// keyed by the parent's new address; the item body is just the 32-bit count
1661  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1662  if (!NT_SUCCESS(Status)) {
1663  ERR("insert_tree_item returned %08lx\n", Status);
1664  return Status;
1665  }
1666  }
1667 
1668  le = le->Flink;
1669  }
1670  }
1671 
1672  return STATUS_SUCCESS;
1673 }
1674 
// NOTE(review): the signature line of this function is missing from this listing,
// as are many interior statements (the embedded listing numbers skip wherever a
// line was lost). The body uses Vcb, a chunk c and an output flag *changed.
// From the visible lines, under the exclusive tree lock it:
//   1. scans the extent tree for non-tree EXTENT_ITEMs inside chunk c and queues
//      them on `items` (bounded per call, see below);
//   2. picks a new address for each queued extent and copies its data there,
//      carrying checksums across where they exist;
//   3. rewrites the affected metadata and inserts the new extent items;
//   4. patches in-memory extent records to the new addresses and flushes.
1676  KEY searchkey;
1677  traverse_ptr tp;
1678  NTSTATUS Status;
1679  bool b;
1680  LIST_ENTRY items, metadata_items, rollback, *le;
1681  uint64_t loaded = 0, num_loaded = 0;
1682  chunk* newchunk = NULL;
1683  uint8_t* data = NULL;
1684 
1685  TRACE("chunk %I64x\n", c->offset);
1686 
1689  InitializeListHead(&metadata_items);
1690 
1691  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1692 
1693  searchkey.obj_id = c->offset;
1694  searchkey.obj_type = TYPE_EXTENT_ITEM;
1695  searchkey.offset = 0xffffffffffffffff;
1696 
1697  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1698  if (!NT_SUCCESS(Status)) {
1699  ERR("find_item returned %08lx\n", Status);
1700  goto end;
1701  }
1702 
// Phase 1: collect data extents that lie inside [c->offset, c->offset + size).
1703  do {
1704  traverse_ptr next_tp;
1705 
1706  if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1707  break;
1708 
1709  if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1710  bool tree = false;
1711 
1712  if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1713  EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1714 
// NOTE(review): the condition guarding the next line is missing from this
// listing; presumably it tests a flag in ei that marks a tree block.
1716  tree = true;
1717  }
1718 
1719  if (!tree) {
1720  Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1721 
1722  if (!NT_SUCCESS(Status)) {
1723  ERR("add_data_reloc returned %08lx\n", Status);
1724  goto end;
1725  }
1726 
1727  loaded += tp.item->key.offset;
1728  num_loaded++;
1729 
1730  if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1731  break;
1732  }
1733  }
1734 
1735  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1736 
1737  if (b)
1738  tp = next_tp;
1739  } while (b);
1740 
// *changed tells the caller whether this pass found anything to move.
1741  if (IsListEmpty(&items)) {
1742  *changed = false;
1744  goto end;
1745  } else
1746  *changed = true;
1747 
// NOTE(review): the allocation of `data` is missing from this listing; it is used
// below as the bounce buffer for reads of at most BALANCE_UNIT bytes.
1749  if (!data) {
1750  ERR("out of memory\n");
1752  goto end;
1753  }
1754 
// Phase 2: for every queued extent, choose a destination and copy the data.
1755  le = items.Flink;
1756  while (le != &items) {
1758  bool done = false;
1759  LIST_ENTRY* le2;
1760  void* csum;
1761  RTL_BITMAP bmp;
1762  ULONG* bmparr;
1763  ULONG bmplen, runlength, index, lastoff;
1764 
// First choice: the chunk used for the previous extent, if it still has room.
1765  if (newchunk) {
1766  acquire_chunk_lock(newchunk, Vcb);
1767 
1768  if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1769  newchunk->used += dr->size;
1770  space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1771  done = true;
1772  }
1773 
1774  release_chunk_lock(newchunk, Vcb);
1775  }
1776 
1777  if (!done) {
1778  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1779 
// Second choice: any other writable chunk of the current data profile that is
// not itself being relocated.
1780  le2 = Vcb->chunks.Flink;
1781  while (le2 != &Vcb->chunks) {
1782  chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1783 
1784  if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1785  acquire_chunk_lock(c2, Vcb);
1786 
1787  if ((c2->chunk_item->size - c2->used) >= dr->size) {
1788  if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1789  c2->used += dr->size;
1790  space_list_subtract(c2, false, dr->new_address, dr->size, &rollback);
1791  release_chunk_lock(c2, Vcb);
1792  newchunk = c2;
1793  done = true;
1794  break;
1795  }
1796  }
1797 
1798  release_chunk_lock(c2, Vcb);
1799  }
1800 
1801  le2 = le2->Flink;
1802  }
1803 
1804  // allocate new chunk if necessary
1805  if (!done) {
1806  Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1807 
1808  if (!NT_SUCCESS(Status)) {
1809  ERR("alloc_chunk returned %08lx\n", Status);
1810  ExReleaseResourceLite(&Vcb->chunk_lock);
1811  goto end;
1812  }
1813 
1814  acquire_chunk_lock(newchunk, Vcb);
1815 
1816  newchunk->balance_num = Vcb->balance.balance_num;
1817 
1818  if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1819  release_chunk_lock(newchunk, Vcb);
1820  ExReleaseResourceLite(&Vcb->chunk_lock);
1821  ERR("could not find address in new chunk\n");
1823  goto end;
1824  } else {
1825  newchunk->used += dr->size;
1826  space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1827  }
1828 
1829  release_chunk_lock(newchunk, Vcb);
1830  }
1831 
1832  ExReleaseResourceLite(&Vcb->chunk_lock);
1833  }
1834 
1835  dr->newchunk = newchunk;
1836 
// One bitmap bit per sector of the extent.
1837  bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size);
1838 
1839  bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1840  if (!bmparr) {
1841  ERR("out of memory\n");
1843  goto end;
1844  }
1845 
// Room for one checksum of Vcb->csum_size bytes per sector.
1846  csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * Vcb->csum_size / Vcb->superblock.sector_size), ALLOC_TAG);
1847  if (!csum) {
1848  ERR("out of memory\n");
1849  ExFreePool(bmparr);
1851  goto end;
1852  }
1853 
1854  RtlInitializeBitMap(&bmp, bmparr, bmplen);
1855  RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1856 
1857  searchkey.obj_id = EXTENT_CSUM_ID;
1858  searchkey.obj_type = TYPE_EXTENT_CSUM;
1859  searchkey.offset = dr->address;
1860 
1861  Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1862  if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1863  ERR("find_item returned %08lx\n", Status);
1864  ExFreePool(csum);
1865  ExFreePool(bmparr);
1866  goto end;
1867  }
1868 
// Gather the existing checksums overlapping the extent into csum and clear the
// matching bitmap bits.
1869  if (Status != STATUS_NOT_FOUND) {
1870  do {
1871  traverse_ptr next_tp;
1872 
1873  if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1874  if (tp.item->key.offset >= dr->address + dr->size)
1875  break;
1876  else if (tp.item->size >= Vcb->csum_size && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / Vcb->csum_size) >= dr->address) {
// [cs, ce) is the overlap between this checksum item's range and the extent
1877  uint64_t cs = max(dr->address, tp.item->key.offset);
1878  uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / Vcb->csum_size));
1879 
1880  RtlCopyMemory((uint8_t*)csum + ((cs - dr->address) * Vcb->csum_size / Vcb->superblock.sector_size),
1881  tp.item->data + ((cs - tp.item->key.offset) * Vcb->csum_size / Vcb->superblock.sector_size),
1882  (ULONG)((ce - cs) * Vcb->csum_size / Vcb->superblock.sector_size));
1883 
1884  RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1885 
1886  if (ce == dr->address + dr->size)
1887  break;
1888  }
1889  }
1890 
1891  if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1892  tp = next_tp;
1893  else
1894  break;
1895  } while (true);
1896  }
1897 
// Copy the extent run by run: sectors before each clear run (no checksum) are
// read with no checksum argument, sectors inside the run are read with their
// checksums passed to read_data. Each transfer is at most BALANCE_UNIT bytes.
1898  lastoff = 0;
1899  runlength = RtlFindFirstRunClear(&bmp, &index);
1900 
1901  while (runlength != 0) {
1902  if (index >= bmplen)
1903  break;
1904 
// clamp a run that would extend past the end of the bitmap
1905  if (index + runlength >= bmplen) {
1906  runlength = bmplen - index;
1907 
1908  if (runlength == 0)
1909  break;
1910  }
1911 
1912  if (index > lastoff) {
1913  ULONG off = lastoff;
1914  ULONG size = index - lastoff;
1915 
1916  // handle no csum run
1917  do {
1918  ULONG rl;
1919 
1920  if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1921  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1922  else
1923  rl = size;
1924 
1925  Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1926  c, NULL, NULL, 0, false, NormalPagePriority);
1927  if (!NT_SUCCESS(Status)) {
1928  ERR("read_data returned %08lx\n", Status);
1929  ExFreePool(csum);
1930  ExFreePool(bmparr);
1931  goto end;
1932  }
1933 
1934  Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1935  NULL, newchunk, false, 0, NormalPagePriority);
1936  if (!NT_SUCCESS(Status)) {
1937  ERR("write_data_complete returned %08lx\n", Status);
1938  ExFreePool(csum);
1939  ExFreePool(bmparr);
1940  goto end;
1941  }
1942 
1943  size -= rl;
1944  off += rl;
1945  } while (size > 0);
1946  }
1947 
// First call passes the checksums for the new address; second call passes NULL
// for the old address (presumably removing the old entries - confirm).
1948  add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, (uint8_t*)csum + (index * Vcb->csum_size), NULL);
1949  add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1950 
1951  // handle csum run
1952  do {
1953  ULONG rl;
1954 
1955  if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1956  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1957  else
1958  rl = runlength;
1959 
1960  Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size,
1961  (uint8_t*)csum + (index * Vcb->csum_size), false, data, c, NULL, NULL, 0, false, NormalPagePriority);
1962  if (!NT_SUCCESS(Status)) {
1963  ERR("read_data returned %08lx\n", Status);
1964  ExFreePool(csum);
1965  ExFreePool(bmparr);
1966  goto end;
1967  }
1968 
1969  Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1970  NULL, newchunk, false, 0, NormalPagePriority);
1971  if (!NT_SUCCESS(Status)) {
1972  ERR("write_data_complete returned %08lx\n", Status);
1973  ExFreePool(csum);
1974  ExFreePool(bmparr);
1975  goto end;
1976  }
1977 
1978  runlength -= rl;
1979  index += rl;
1980  } while (runlength > 0);
1981 
1982  lastoff = index;
1983  runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1984  }
1985 
1986  ExFreePool(csum);
1987  ExFreePool(bmparr);
1988 
1989  // handle final nocsum run
1990  if (lastoff < dr->size / Vcb->superblock.sector_size) {
1991  ULONG off = lastoff;
1992  ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1993 
1994  do {
1995  ULONG rl;
1996 
1997  if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1998  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1999  else
2000  rl = size;
2001 
2002  Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
2003  c, NULL, NULL, 0, false, NormalPagePriority);
2004  if (!NT_SUCCESS(Status)) {
2005  ERR("read_data returned %08lx\n", Status);
2006  goto end;
2007  }
2008 
2009  Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
2010  NULL, newchunk, false, 0, NormalPagePriority);
2011  if (!NT_SUCCESS(Status)) {
2012  ERR("write_data_complete returned %08lx\n", Status);
2013  goto end;
2014  }
2015 
2016  size -= rl;
2017  off += rl;
2018  } while (size > 0);
2019  }
2020 
2021  le = le->Flink;
2022  }
2023 
// data is set to NULL after freeing so the cleanup at the end does not free it twice
2024  ExFreePool(data);
2025  data = NULL;
2026 
// Phase 3: write out the relocated metadata, then the new extent items.
2027  Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2028  if (!NT_SUCCESS(Status)) {
2029  ERR("write_metadata_items returned %08lx\n", Status);
2030  goto end;
2031  }
2032 
2033  le = items.Flink;
2034  while (le != &items) {
2036 
2038  if (!NT_SUCCESS(Status)) {
2039  ERR("add_data_reloc_extent_item returned %08lx\n", Status);
2040  goto end;
2041  }
2042 
2043  le = le->Flink;
2044  }
2045 
// Re-address the changed_extents entries of c that refer to a relocated extent.
// le3 is saved before the inner loop. NOTE(review): the two statements that
// follow the address update are missing from this listing.
2046  le = c->changed_extents.Flink;
2047  while (le != &c->changed_extents) {
2048  LIST_ENTRY *le2, *le3;
2050 
2051  le3 = le->Flink;
2052 
2053  le2 = items.Flink;
2054  while (le2 != &items) {
2056 
2057  if (ce->address == dr->address) {
2058  ce->address = dr->new_address;
2061  break;
2062  }
2063 
2064  le2 = le2->Flink;
2065  }
2066 
2067  le = le3;
2068  }
2069 
2071 
2072  Vcb->need_write = true;
2073 
// Common exit: reached both on success and via goto on failure.
2074 end:
2075  if (NT_SUCCESS(Status)) {
2076  // update extents in cache inodes before we flush
2077  le = Vcb->chunks.Flink;
2078  while (le != &Vcb->chunks) {
2080 
2081  if (c2->cache) {
2082  LIST_ENTRY* le2;
2083 
2084  ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2085 
2086  le2 = c2->cache->extents.Flink;
2087  while (le2 != &c2->cache->extents) {
2089 
2090  if (!ext->ignore) {
2091  if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2092  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2093 
// only extents whose address lies inside the chunk being balanced can have moved
2094  if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2095  LIST_ENTRY* le3 = items.Flink;
2096  while (le3 != &items) {
2098 
2099  if (ed2->address == dr->address) {
2100  ed2->address = dr->new_address;
2101  break;
2102  }
2103 
2104  le3 = le3->Flink;
2105  }
2106  }
2107  }
2108  }
2109 
2110  le2 = le2->Flink;
2111  }
2112 
2113  ExReleaseResourceLite(c2->cache->Header.Resource);
2114  }
2115 
2116  le = le->Flink;
2117  }
2118 
2119  Status = do_write(Vcb, NULL);
2120  if (!NT_SUCCESS(Status))
2121  ERR("do_write returned %08lx\n", Status);
2122  }
2123 
// Status is re-tested here because do_write above may have failed.
2124  if (NT_SUCCESS(Status)) {
2126 
2127  // update open FCBs
2128  // FIXME - speed this up(?)
2129 
2130  le = Vcb->all_fcbs.Flink;
2131  while (le != &Vcb->all_fcbs) {
2132  struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2133  LIST_ENTRY* le2;
2134 
2135  ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2136 
2137  le2 = fcb->extents.Flink;
2138  while (le2 != &fcb->extents) {
2140 
2141  if (!ext->ignore) {
2142  if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2143  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2144 
2145  if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2146  LIST_ENTRY* le3 = items.Flink;
2147  while (le3 != &items) {
2149 
2150  if (ed2->address == dr->address) {
2151  ed2->address = dr->new_address;
2152  break;
2153  }
2154 
2155  le3 = le3->Flink;
2156  }
2157  }
2158  }
2159  }
2160 
2161  le2 = le2->Flink;
2162  }
2163 
2164  ExReleaseResourceLite(fcb->Header.Resource);
2165 
2166  le = le->Flink;
2167  }
2168  } else
// NOTE(review): the statement belonging to this else is missing from this
// listing; presumably the failure path undoes the entries recorded on `rollback`.
2170 
2171  free_trees(Vcb);
2172 
2173  ExReleaseResourceLite(&Vcb->tree_lock);
2174 
2175  if (data)
2176  ExFreePool(data);
2177 
// Release the work lists: each reloc entry and every ref hanging off it.
2178  while (!IsListEmpty(&items)) {
2180 
2181  while (!IsListEmpty(&dr->refs)) {
2183 
2184  ExFreePool(ref);
2185  }
2186 
2187  ExFreePool(dr);
2188  }
2189 
2190  while (!IsListEmpty(&metadata_items)) {
2192 
2193  while (!IsListEmpty(&mr->refs)) {
2195 
2196  ExFreePool(ref);
2197  }
2198 
2199  if (mr->data)
2200  ExFreePool(mr->data);
2201 
2202  ExFreePool(mr);
2203  }
2204 
2205  return Status;
2206 }
2207 
2208 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2209  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2210  return BLOCK_FLAG_RAID0;
2211  else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2212  return BLOCK_FLAG_RAID1;
2213  else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2214  return BLOCK_FLAG_DUPLICATE;
2215  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2216  return BLOCK_FLAG_RAID10;
2217  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2218  return BLOCK_FLAG_RAID5;
2219  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2220  return BLOCK_FLAG_RAID6;
2221  else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3)
2222  return BLOCK_FLAG_RAID1C3;
2223  else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4)
2224  return BLOCK_FLAG_RAID1C4;
2225  else
2226  return BLOCK_FLAG_SINGLE;
2227 }
2228 
// NOTE(review): the signature line of this function is missing from this listing.
// The body uses Vcb, an index `sort` into Vcb->balance.opts, and a chunk c.
// From the visible lines: returns false as soon as the chunk fails one of the
// enabled filters in the selected option set (profiles, device id, device byte
// range, virtual address range, stripe count, usage percentage, and a final
// check against opts->convert), and true if it passes all of them.
2230  btrfs_balance_opts* opts;
2231 
2232  opts = &Vcb->balance.opts[sort];
2233 
2234  if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2235  return false;
2236 
2237  if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
// NOTE(review): the declaration of `type` is missing from this listing.
2239 
2240  if (!(type & opts->profiles))
2241  return false;
2242  }
2243 
// Device filter: at least one stripe must live on opts->devid.
2244  if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2245  uint16_t i;
2246  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2247  bool b = false;
2248 
2249  for (i = 0; i < c->chunk_item->num_stripes; i++) {
2250  if (cis[i].dev_id == opts->devid) {
2251  b = true;
2252  break;
2253  }
2254  }
2255 
2256  if (!b)
2257  return false;
2258  }
2259 
// Device-range filter: at least one stripe's physical extent must overlap
// [drange_start, drange_end), on the selected device if a devid filter is also set.
2260  if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2261  uint16_t i, factor;
2262  uint64_t physsize;
2263  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2264  bool b = false;
2265 
// factor = divisor applied to the chunk size to get the per-stripe size used below
2266  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2267  factor = c->chunk_item->num_stripes;
2268  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2269  factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2270  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2271  factor = c->chunk_item->num_stripes - 1;
2272  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2273  factor = c->chunk_item->num_stripes - 2;
2274  else // SINGLE, DUPLICATE, RAID1, RAID1C3, RAID1C4
2275  factor = 1;
2276 
2277  physsize = c->chunk_item->size / factor;
2278 
2279  for (i = 0; i < c->chunk_item->num_stripes; i++) {
2280  if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2281  (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2282  b = true;
2283  break;
2284  }
2285  }
2286 
2287  if (!b)
2288  return false;
2289  }
2290 
2291  if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2292  if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2293  return false;
2294  }
2295 
2296  if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
// NOTE(review): both comparisons use `<`. For a [stripes_start, stripes_end]
// range filter one would expect the second test to reject
// num_stripes > stripes_end; as written, chunks with fewer stripes than
// stripes_end are rejected and chunks above it are accepted. Looks like a
// wrong operator - confirm against the intended filter semantics.
2297  if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2298  return false;
2299  }
2300 
2301  if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2302  uint64_t usage = c->used * 100 / c->chunk_item->size;
2303 
2304  // usage == 0 should mean completely empty, not just that usage rounds to 0%
2305  if (c->used > 0 && usage == 0)
2306  usage = 1;
2307 
2308  if (usage < opts->usage_start || usage > opts->usage_end)
2309  return false;
2310  }
2311 
// NOTE(review): the condition opening this block and the declaration of `type`
// are missing from this listing. The visible part skips chunks whose profile
// already equals opts->convert.
2314 
2315  if (type == opts->convert)
2316  return false;
2317  }
2318 
2319  return true;
2320 }
2321 
// NOTE(review): the signature line of this function is missing from this listing.
// Judging by the visible call sites, this is copy_balance_args(opts, args), with
// opts a btrfs_balance_opts* and args pointing at one member of a BALANCE_ITEM.
// From the visible lines: for every filter flag set in opts->flags, it copies the
// matching values into args and sets the matching BALANCE_ARGS_FLAGS_* bit.
// Several of the flag-setting lines are missing from this listing.
2323  if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2324  args->profiles = opts->profiles;
2326  }
2327 
2328  if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
// NOTE(review): this tests the destination (args->usage_start), not the source.
// The visible caller zeroes the whole BALANCE_ITEM before calling, so from that
// caller this condition is always true and the else branch is never taken.
// Possibly opts->usage_start was intended - confirm.
2329  if (args->usage_start == 0) {
2331  args->usage_start = opts->usage_start;
2332  args->usage_end = opts->usage_end;
2333  } else {
2334  args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2335  args->usage = opts->usage_end;
2336  }
2337  }
2338 
2339  if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2340  args->devid = opts->devid;
2341  args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2342  }
2343 
2344  if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2345  args->drange_start = opts->drange_start;
2346  args->drange_end = opts->drange_end;
2347  args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2348  }
2349 
2350  if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2351  args->vrange_start = opts->vrange_start;
2352  args->vrange_end = opts->vrange_end;
2353  args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2354  }
2355 
2356  if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2357  args->convert = opts->convert;
2358  args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2359 
// the soft flag is only propagated together with a convert target
2360  if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2361  args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2362  }
2363 
2364  if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
// NOTE(review): same pattern as the usage filter above - the destination field
// is tested rather than opts->limit_start. Confirm.
2365  if (args->limit_start == 0) {
2367  args->limit_start = (uint32_t)opts->limit_start;
2368  args->limit_end = (uint32_t)opts->limit_end;
2369  } else {
2370  args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2371  args->limit = opts->limit_end;
2372  }
2373  }
2374 
2375  if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2376  args->stripes_start = opts->stripes_start;
2377  args->stripes_end = opts->stripes_end;
2379  }
2380 }
2381 
// NOTE(review): the signature line of this function is missing from this listing;
// the body uses only Vcb.
// From the visible lines: under the exclusive tree lock it looks up the
// (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0) key in the root tree, handles an existing
// item with that key (the statement is missing; the error message indicates
// delete_tree_item), builds a fresh BALANCE_ITEM from Vcb->balance.opts, inserts
// it, and on success flushes with do_write.
2383  KEY searchkey;
2384  traverse_ptr tp;
2385  NTSTATUS Status;
2386  BALANCE_ITEM* bi;
2387 
2388  searchkey.obj_id = BALANCE_ITEM_ID;
2389  searchkey.obj_type = TYPE_TEMP_ITEM;
2390  searchkey.offset = 0;
2391 
2392  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2393 
2394  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2395  if (!NT_SUCCESS(Status)) {
2396  ERR("find_item returned %08lx\n", Status);
2397  goto end;
2398  }
2399 
// keycmp is negated here, i.e. the block presumably runs when an item with
// exactly this key already exists - confirm keycmp's return convention
2400  if (!keycmp(tp.item->key, searchkey)) {
2402  if (!NT_SUCCESS(Status)) {
2403  ERR("delete_tree_item returned %08lx\n", Status);
2404  goto end;
2405  }
2406  }
2407 
// NOTE(review): the allocation of bi is missing from this listing.
2409  if (!bi) {
2410  ERR("out of memory\n");
2412  goto end;
2413  }
2414 
// Start from an all-zero item so that unset filters and flags stay zero.
2415  RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2416 
2417  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2418  bi->flags |= BALANCE_FLAGS_DATA;
2419  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2420  }
2421 
2422  if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2424  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2425  }
2426 
2427  if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2428  bi->flags |= BALANCE_FLAGS_SYSTEM;
2429  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2430  }
2431 
2432  Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2433  if (!NT_SUCCESS(Status)) {
2434  ERR("insert_tree_item returned %08lx\n", Status);
// bi is freed here on failure
2435  ExFreePool(bi);
2436  goto end;
2437  }
2438 
2440 
// Common exit: flush only if everything so far succeeded; trees are freed and
// the lock released on every path.
2441 end:
2442  if (NT_SUCCESS(Status)) {
2443  Status = do_write(Vcb, NULL);
2444  if (!NT_SUCCESS(Status))
2445  ERR("do_write returned %08lx\n", Status);
2446  }
2447 
2448  free_trees(Vcb);
2449 
2450  ExReleaseResourceLite(&Vcb->tree_lock);
2451 
2452  return Status;
2453 }
2454 
// NOTE(review): the signature line of this function is missing from this listing;
// the body uses only Vcb.
// From the visible lines: under the exclusive tree lock it looks up the
// (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0) key in the root tree and, when an item
// with that key is found, handles it (the statement is missing; the error
// message indicates delete_tree_item) and flushes with do_write.
2456  KEY searchkey;
2457  traverse_ptr tp;
2458  NTSTATUS Status;
2459 
2460  searchkey.obj_id = BALANCE_ITEM_ID;
2461  searchkey.obj_type = TYPE_TEMP_ITEM;
2462  searchkey.offset = 0;
2463 
2464  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2465 
2466  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2467  if (!NT_SUCCESS(Status)) {
2468  ERR("find_item returned %08lx\n", Status);
2469  goto end;
2470  }
2471 
2472  if (!keycmp(tp.item->key, searchkey)) {
2474  if (!NT_SUCCESS(Status)) {
2475  ERR("delete_tree_item returned %08lx\n", Status);
2476  goto end;
2477  }
2478 
2479  Status = do_write(Vcb, NULL);
2480  if (!NT_SUCCESS(Status)) {
2481  ERR("do_write returned %08lx\n", Status);
// NOTE(review): this path (and the delete failure above) jumps past free_trees,
// which is only reached on success. Other functions in this file call
// free_trees on every exit path; confirm whether skipping it here is intended.
2482  goto end;
2483  }
2484 
2485  free_trees(Vcb);
2486  }
2487 
2489 
2490 end:
2491  ExReleaseResourceLite(&Vcb->tree_lock);
2492 
2493  return Status;
2494 }
2495 
2498 
2499  if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2501  opts->profiles = args->profiles;
2502  }
2503 
2504  if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2506 
2507  opts->usage_start = 0;
2508  opts->usage_end = (uint8_t)args->usage;
2509  } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2511 
2512  opts->usage_start = (uint8_t)args->usage_start;
2513  opts->usage_end = (uint8_t)args->usage_end;
2514  }
2515 
2516  if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2518  opts->devid = args->devid;
2519  }
2520 
2521  if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2523  opts->drange_start = args->drange_start;
2524  opts->drange_end = args->drange_end;
2525  }
2526 
2527  if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2529  opts->vrange_start = args->vrange_start;
2530  opts->vrange_end = args->vrange_end;
2531  }
2532 
2533  if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2535 
2536  opts->limit_start = 0;
2537  opts->limit_end = args->limit;
2538  } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2540 
2541  opts->limit_start = args->limit_start;
2542  opts->limit_end = args->limit_end;
2543  }
2544 
2545  if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2547 
2548  opts->stripes_start = (uint16_t)args->stripes_start;
2549  opts->stripes_end = (uint16_t)args->stripes_end;
2550  }
2551 
2552  if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2554  opts->convert = args->convert;
2555 
2556  if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2557  opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2558  }
2559 }
2560 
// NOTE(review): the signature line of this function is missing from this listing.
// Judging by the "remove_superblocks returned" message logged by its caller,
// this is presumably remove_superblocks(dev).
// From the visible lines: it overwrites every superblock location on the device
// with a zero-filled, superblock-sized buffer, stopping at the first write
// failure and returning that status.
2562  NTSTATUS Status;
2563  superblock* sb;
2564  int i = 0;
2565 
// NOTE(review): the allocation of sb is missing from this listing.
2567  if (!sb) {
2568  ERR("out of memory\n");
2570  }
2571 
2572  RtlZeroMemory(sb, sizeof(superblock));
2573 
// superblock_addrs is treated as a list ended by a non-positive entry; the loop
// also stops at the first location whose superblock would not fit within the
// device's num_bytes.
2574  while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2575  Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2576 
2577  if (!NT_SUCCESS(Status)) {
2578  ExFreePool(sb);
2579  return Status;
2580  }
2581 
2582  i++;
2583  }
2584 
2585  ExFreePool(sb);
2586 
2587  return STATUS_SUCCESS;
2588 }
2589 
/* Body of what appears to be finish_removing_device(Vcb, dev) - the signature line is missing from
 * this listing and the name is taken from the caller's error message (TODO confirm).
 * Flushes pending changes, looks up the device's DEV_ITEM (chunk tree) and DEV_STATS (device tree)
 * items for deletion, adjusts the superblock counters, unlinks the device from the volume and frees it.
 * Returns STATUS_SUCCESS, or the first failing status reported for do_write, find_item or
 * delete_tree_item. Several statements (marked below) are absent from this listing. */
2591  KEY searchkey;
2592  traverse_ptr tp;
2593  NTSTATUS Status;
2594  LIST_ENTRY* le;
2596 
// write out anything pending before touching the trees; the else branch's statement is not shown here
2597  if (Vcb->need_write) {
2598  Status = do_write(Vcb, NULL);
2599 
2600  if (!NT_SUCCESS(Status))
2601  ERR("do_write returned %08lx\n", Status);
2602  } else
2604 
// trees are released whether or not the write succeeded; only then is the failure propagated
2605  free_trees(Vcb);
2606 
2607  if (!NT_SUCCESS(Status))
2608  return Status;
2609 
2610  // remove entry in chunk tree
2611 
2612  searchkey.obj_id = 1;
2613  searchkey.obj_type = TYPE_DEV_ITEM;
2614  searchkey.offset = dev->devitem.dev_id;
2615 
2616  Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2617  if (!NT_SUCCESS(Status)) {
2618  ERR("find_item returned %08lx\n", Status);
2619  return Status;
2620  }
2621 
// only act when the item found has exactly the key searched for (the delete call line is not shown here)
2622  if (!keycmp(searchkey, tp.item->key)) {
2624 
2625  if (!NT_SUCCESS(Status)) {
2626  ERR("delete_tree_item returned %08lx\n", Status);
2627  return Status;
2628  }
2629  }
2630 
2631  // remove stats entry in device tree
2632 
2633  searchkey.obj_id = 0;
2634  searchkey.obj_type = TYPE_DEV_STATS;
2635  searchkey.offset = dev->devitem.dev_id;
2636 
2637  Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2638  if (!NT_SUCCESS(Status)) {
2639  ERR("find_item returned %08lx\n", Status);
2640  return Status;
2641  }
2642 
// same exact-match rule as for the DEV_ITEM above
2643  if (!keycmp(searchkey, tp.item->key)) {
2645 
2646  if (!NT_SUCCESS(Status)) {
2647  ERR("delete_tree_item returned %08lx\n", Status);
2648  return Status;
2649  }
2650  }
2651 
2652  // update superblock
2653 
2654  Vcb->superblock.num_devices--;
2655  Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2656  Vcb->devices_loaded--;
2657 
2658  RemoveEntryList(&dev->list_entry);
2659 
2660  // flush
2661 
2662  Status = do_write(Vcb, NULL);
2663  if (!NT_SUCCESS(Status))
2664  ERR("do_write returned %08lx\n", Status);
2665 
2666  free_trees(Vcb);
2667 
2668  if (!NT_SUCCESS(Status))
2669  return Status;
2670 
// wiping the old superblocks is best-effort: a failure is only logged as a warning
2671  if (!dev->readonly && dev->devobj) {
2673  if (!NT_SUCCESS(Status))
2674  WARN("remove_superblocks returned %08lx\n", Status);
2675  }
2676 
2677  // remove entry in volume list
2678 
2679  vde = Vcb->vde;
2680 
2681  if (dev->devobj) {
2682  pdo_device_extension* pdode = vde->pdode;
2683 
2685 
// find the child entry whose UUID matches the device being removed
2686  le = pdode->children.Flink;
2687  while (le != &pdode->children) {
2689 
2690  if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2693  UNICODE_STRING mmdevpath;
2694 
2695  pdode->children_loaded--;
2696 
2697  if (vc->had_drive_letter) { // re-add entry to mountmgr
2700  if (!NT_SUCCESS(Status))
2701  ERR("IoGetDeviceObjectPointer returned %08lx\n", Status);
2702  else {
2703  MOUNTDEV_NAME mdn;
2704 
// first query uses a fixed-size MOUNTDEV_NAME; its NameLength sizes the second, full query below
2705  Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2707  ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2708  else {
2709  MOUNTDEV_NAME* mdn2;
2710  ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2711 
2712  mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2713  if (!mdn2)
2714  ERR("out of memory\n");
2715  else {
2716  Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2717  if (!NT_SUCCESS(Status))
2718  ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2719  else {
2721 
// name aliases the buffer inside mdn2, so it must be used before mdn2 is freed
2722  name.Buffer = mdn2->Name;
2723  name.Length = name.MaximumLength = mdn2->NameLength;
2724 
2726  if (!NT_SUCCESS(Status))
2727  WARN("mountmgr_add_drive_letter returned %08lx\n", Status);
2728  }
2729 
2730  ExFreePool(mdn2);
2731  }
2732  }
2733 
2735  }
2736  }
2737 
2738  ExFreePool(vc->pnp_name.Buffer);
2740  ExFreePool(vc);
2741 
2743 
2744  break;
2745  }
2746 
2747  le = le->Flink;
2748  }
2749 
// recompute FILE_REMOVABLE_MEDIA: clear it, then set it again if any remaining child still has it
2750  if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2751  vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2752 
2753  le = pdode->children.Flink;
2754  while (le != &pdode->children) {
2756 
2757  if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2758  vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2759  break;
2760  }
2761 
2762  le = le->Flink;
2763  }
2764  }
2765 
2766  pdode->num_children = Vcb->superblock.num_devices;
2767 
2769 
2770  // free dev
2771 
// the statement guarded by this condition is not shown in this listing
2772  if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2774  }
2775 
// release every entry of the device's free-space list
2776  while (!IsListEmpty(&dev->space)) {
2777  LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2779 
2780  ExFreePool(s);
2781  }
2782 
2783  ExFreePool(dev);
2784 
// recompute Vcb->trim: it stays set only if at least one remaining device has trim set
2785  if (Vcb->trim) {
2786  Vcb->trim = false;
2787 
2788  le = Vcb->devices.Flink;
2789  while (le != &Vcb->devices) {
2790  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2791 
2792  if (dev2->trim) {
2793  Vcb->trim = true;
2794  break;
2795  }
2796 
2797  le = le->Flink;
2798  }
2799  }
2800 
2802 
2803  return STATUS_SUCCESS;
2804 }
2805 
/* Body of what appears to be trim_unalloc_space(Vcb, dev) - the signature line is missing from this
 * listing and the name is taken from the call in balance_thread (TODO confirm).
 * Walks the device's DEV_EXTENT items in the device tree and queues every gap between them, from the
 * 1 MB mark up to the end of the device, as a trim entry. The queued ranges are then packed into one
 * DEVICE_MANAGE_DATA_SET_ATTRIBUTES buffer with action DeviceDsmAction_Trim (the ioctl call line
 * itself is not visible in this listing).
 * Nothing is returned; failures are logged. Every exit taken after entries may have been queued goes
 * through `end`, so dev->trim_list is left empty and num_trim_entries is reset. */
2808  DEVICE_DATA_SET_RANGE* ranges;
2809  ULONG datalen, i;
2810  KEY searchkey;
2811  traverse_ptr tp;
2812  NTSTATUS Status;
2813  bool b;
2814  uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2815  LIST_ENTRY* le;
2816 
2817  dev->num_trim_entries = 0;
2818 
2819  searchkey.obj_id = dev->devitem.dev_id;
2820  searchkey.obj_type = TYPE_DEV_EXTENT;
2821  searchkey.offset = 0;
2822 
2823  Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2824  if (!NT_SUCCESS(Status)) {
2825  ERR("find_item returned %08lx\n", Status);
2826  return;
2827  }
2828 
// lastoff tracks the end of the previous extent; anything between it and the next extent is unallocated
2829  do {
2830  traverse_ptr next_tp;
2831 
2832  if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2833  if (tp.item->size >= sizeof(DEV_EXTENT)) {
2834  DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2835 
2836  if (tp.item->key.offset > lastoff)
2837  add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2838 
2839  lastoff = tp.item->key.offset + de->length;
2840  } else {
2841  ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
// entries may already be queued on dev->trim_list: leave through the clean-up path so they are freed
// rather than being picked up by a later run whose buffer is sized without them
2842  goto end;
2843  }
2844  }
2845 
2846  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2847 
// stop once the walk has moved past this device's DEV_EXTENT keys
2848  if (b) {
2849  tp = next_tp;
2850  if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2851  break;
2852  }
2853  } while (b);
2854 
// trailing gap between the last extent and the end of the device
2855  if (lastoff < dev->devitem.num_bytes)
2856  add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2857 
2858  if (dev->num_trim_entries == 0)
2859  return;
2860 
// header rounded up to a multiple of 8 bytes, followed by one DEVICE_DATA_SET_RANGE per queued entry
2861  datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2862 
// the line that allocates dmdsa is not shown in this listing
2864  if (!dmdsa) {
2865  ERR("out of memory\n");
2866  goto end;
2867  }
2868 
2869  dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2870  dmdsa->Action = DeviceDsmAction_Trim;
2872  dmdsa->ParameterBlockOffset = 0;
2873  dmdsa->ParameterBlockLength = 0;
2875  dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2876 
2877  ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2878 
// copy each queued entry into the range array
2879  i = 0;
2880  le = dev->trim_list.Flink;
2881  while (le != &dev->trim_list) {
2883 
2884  ranges[i].StartingOffset = s->address;
2885  ranges[i].LengthInBytes = s->size;
2886  i++;
2887 
2888  le = le->Flink;
2889  }
2890 
2892  if (!NT_SUCCESS(Status))
2893  WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08lx\n", Status);
2894 
2895  ExFreePool(dmdsa);
2896 
2897 end:
// drain the queue so that no entry survives into the next invocation
2898  while (!IsListEmpty(&dev->trim_list)) {
2900  ExFreePool(s);
2901  }
2902 
2903  dev->num_trim_entries = 0;
2904 }
2905 
/* Body of what appears to be try_consolidation(Vcb, flags, newchunk) - the signature line is missing
 * from this listing and the name is taken from the call sites in balance_thread (TODO confirm).
 * Repeatedly picks the least-used writable data chunk that has not yet been handled in the current
 * balance run, relocates it with balance_data_chunk and flushes. When no candidate is left it calls
 * alloc_chunk(flags) and, on success, stores the new chunk in *newchunk.
 * Returns the alloc_chunk status, or an earlier balance_data_chunk / do_write failure.
 * NOTE(review): if the balance is stopped part-way through, this returns STATUS_SUCCESS without
 * writing *newchunk - callers must not use the output in that case. */
2907  NTSTATUS Status;
2908  bool changed;
2909  LIST_ENTRY* le;
2910  chunk* rc;
2911 
2912  // FIXME - allow with metadata chunks?
2913 
2914  while (true) {
2915  rc = NULL;
2916 
2917  ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2918 
2919  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2920 
2921  // choose the least-used chunk we haven't looked at yet
2922  le = Vcb->chunks.Flink;
2923  while (le != &Vcb->chunks) {
2925 
2926  // FIXME - skip full-size chunks over e.g. 90% full?
// balance_num equal to the current run's number marks a chunk as already handled
2927  if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2928  rc = c;
2929 
2930  le = le->Flink;
2931  }
2932 
2933  ExReleaseResourceLite(&Vcb->chunk_lock);
2934 
// no candidate left: leave the loop and retry the allocation below
2935  if (!rc) {
2936  ExReleaseResourceLite(&Vcb->tree_lock);
2937  break;
2938  }
2939 
// a non-NULL Flink means the chunk is queued on the balance list (the unlinking line is not shown here)
2940  if (rc->list_entry_balance.Flink) {
2942  Vcb->balance.chunks_left--;
2943  }
2944 
2945  rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2946  rc->reloc = true;
2947 
2948  ExReleaseResourceLite(&Vcb->tree_lock);
2949 
// keep relocating until balance_data_chunk reports that nothing changed
2950  do {
2951  changed = false;
2952 
2953  Status = balance_data_chunk(Vcb, rc, &changed);
2954  if (!NT_SUCCESS(Status)) {
2955  ERR("balance_data_chunk returned %08lx\n", Status);
2956  Vcb->balance.status = Status;
2958  rc->reloc = false;
2959  return Status;
2960  }
2961 
// blocks here while the balance event is not signalled (pause_balance clears it)
2962  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2963 
2964  if (Vcb->readonly)
2965  Vcb->balance.stopping = true;
2966 
// NOTE(review): success is returned here although *newchunk has not been set
2967  if (Vcb->balance.stopping)
2968  return STATUS_SUCCESS;
2969  } while (changed);
2970 
2972 
// mark the chunk dirty and stamp it as handled in this run before flushing
2973  rc->changed = true;
2974  rc->space_changed = true;
2975  rc->balance_num = Vcb->balance.balance_num;
2976 
2977  Status = do_write(Vcb, NULL);
2978  if (!NT_SUCCESS(Status)) {
2979  ERR("do_write returned %08lx\n", Status);
2980  return Status;
2981  }
2982 
2983  free_trees(Vcb);
2984  }
2985 
2986  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2987 
2988  Status = alloc_chunk(Vcb, flags, &rc, true);
2989 
2990  ExReleaseResourceLite(&Vcb->chunk_lock);
2991 
2992  if (NT_SUCCESS(Status)) {
2993  *newchunk = rc;
2994  return Status;
2995  } else {
2996  ERR("alloc_chunk returned %08lx\n", Status);
2997  return Status;
2998  }
2999 }
3000 
/* Body of what appears to be regenerate_space_list(Vcb, dev) - the signature line is missing from
 * this listing and the name is taken from the caller's warning message (TODO confirm).
 * Rebuilds dev->space from scratch: empties the list, adds one range covering everything from the
 * 1 MB mark to the end of the device, then subtracts each stripe that any chunk places on this device.
 * Always returns STATUS_SUCCESS. */
3002  LIST_ENTRY* le;
3003 
// discard the existing free-space entries
3004  while (!IsListEmpty(&dev->space)) {
3006 
3007  ExFreePool(s);
3008  }
3009 
3010  // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3011 
3012  space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3013 
3014  le = Vcb->chunks.Flink;
3015  while (le != &Vcb->chunks) {
3016  uint16_t n;
// the stripe array is read from the memory immediately following the CHUNK_ITEM header
3018  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3019 
3020  for (n = 0; n < c->chunk_item->num_stripes; n++) {
// NOTE(review): stripe_size is re-initialised on every iteration, so the "== 0" test below is always true
3021  uint64_t stripe_size = 0;
3022 
3023  if (cis[n].dev_id == dev->devitem.dev_id) {
3024  if (stripe_size == 0) {
3025  uint16_t factor;
3026 
// factor is the divisor applied to the chunk size to obtain the per-device stripe size
3027  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3028  factor = c->chunk_item->num_stripes;
3029  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3030  factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3031  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3032  factor = c->chunk_item->num_stripes - 1;
3033  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3034  factor = c->chunk_item->num_stripes - 2;
3035  else // SINGLE, DUP, RAID1, RAID1C3, RAID1C4
3036  factor = 1;
3037 
3038  stripe_size = c->chunk_item->size / factor;
3039  }
3040 
3041  space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3042  }
3043  }
3044 
3045  le = le->Flink;
3046  }
3047 
3048  return STATUS_SUCCESS;
3049 }
3050 
/* Worker thread that carries out a balance. The same routine also finishes a device removal or a
 * device shrink, selected by Vcb->balance.removing / Vcb->balance.shrinking.
 * `context` is presumably the volume's device_extension (the line deriving Vcb from it is missing
 * from this listing - TODO confirm).
 * Phases: apply any requested profile conversion to the Vcb block-group flags; build the list of
 * chunks to relocate; pre-allocate a fresh chunk for each block-group type that has no chunk left
 * untouched; relocate data chunks, then metadata/system chunks; run the clean-up at `end`.
 * Errors are reported through Vcb->balance.status; completion is signalled on Vcb->balance.finished.
 * Several statements (marked below) are absent from this listing. */
3051 _Function_class_(KSTART_ROUTINE)
3052 void __stdcall balance_thread(void* context) {
3054  LIST_ENTRY chunks;
3055  LIST_ENTRY* le;
3056  uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3057  uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3058  NTSTATUS Status;
3059 
3060  Vcb->balance.balance_num++;
3061 
3062  Vcb->balance.stopping = false;
3063  KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3064 
// when converting, switch the volume's flags to the target profile, remembering the old value for rollback
3065  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3066  old_data_flags = Vcb->data_flags;
3067  Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3068 
3070  }
3071 
3072  if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3073  old_metadata_flags = Vcb->metadata_flags;
3074  Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3075  }
3076 
3077  if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3078  old_system_flags = Vcb->system_flags;
3079  Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3080  }
3081 
// with mixed block groups, data and metadata share one set of options (data takes precedence)
3082  if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3083  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3084  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3085  else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3086  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3087  }
3088 
3089  num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3090  Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3091 
3092  InitializeListHead(&chunks);
3093 
3094  // FIXME - what are we supposed to do with limit_start?
3095 
// an ordinary balance is recorded first (the call line is not shown here); removal/shrink only flushes
3096  if (!Vcb->readonly) {
3097  if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3099  if (!NT_SUCCESS(Status)) {
3100  ERR("add_balance_item returned %08lx\n", Status);
3101  Vcb->balance.status = Status;
3102  goto end;
3103  }
3104  } else {
3105  if (Vcb->need_write) {
3106  Status = do_write(Vcb, NULL);
3107 
3108  free_trees(Vcb);
3109 
3110  if (!NT_SUCCESS(Status)) {
3111  ERR("do_write returned %08lx\n", Status);
3112  Vcb->balance.status = Status;
3113  goto end;
3114  }
3115  }
3116  }
3117  }
3118 
// blocks here while the balance event is not signalled (pause_balance clears it)
3119  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3120 
3121  if (Vcb->balance.stopping)
3122  goto end;
3123 
3124  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3125 
// classify every chunk: either queue it for relocation or count it as an untouched ("okay") chunk
3126  le = Vcb->chunks.Flink;
3127  while (le != &Vcb->chunks) {
3129  uint8_t sort;
3130 
3132 
// the assignments to `sort` in each branch are not shown in this listing
3133  if (c->chunk_item->type & BLOCK_FLAG_DATA)
3135  else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3137  else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3139  else {
3140  ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3142  break;
3143  }
3144 
// the second half of this condition is not shown in this listing
3145  if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3147  InsertTailList(&chunks, &c->list_entry_balance);
3148 
3149  num_chunks[sort]++;
3150  Vcb->balance.total_chunks++;
3151  Vcb->balance.chunks_left++;
3152  } else if (sort == BALANCE_OPTS_METADATA)
3153  okay_metadata_chunks++;
3154  else if (sort == BALANCE_OPTS_DATA)
3155  okay_data_chunks++;
3156  else if (sort == BALANCE_OPTS_SYSTEM)
3157  okay_system_chunks++;
3158 
3159  if (!c->cache_loaded) {
3161 
3162  if (!NT_SUCCESS(Status)) {
3163  ERR("load_cache_chunk returned %08lx\n", Status);
3164  Vcb->balance.status = Status;
3166  ExReleaseResourceLite(&Vcb->chunk_lock);
3167  goto end;
3168  }
3169  }
3170 
3172 
3173  le = le->Flink;
3174  }
3175 
3176  ExReleaseResourceLite(&Vcb->chunk_lock);
3177 
3178  // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3179  if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
// consolidation is attempted at most once: after it, a further STATUS_DISK_FULL is treated as fatal
3180  bool consolidated = false;
3181  chunk* c;
3182 
3183  if (okay_metadata_chunks == 0) {
3184  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3185 
3186  Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3187  if (NT_SUCCESS(Status))
3188  c->balance_num = Vcb->balance.balance_num;
3189  else if (Status != STATUS_DISK_FULL || consolidated) {
3190  ERR("alloc_chunk returned %08lx\n", Status);
3191  ExReleaseResourceLite(&Vcb->chunk_lock);
3192  Vcb->balance.status = Status;
3193  goto end;
3194  }
3195 
3196  ExReleaseResourceLite(&Vcb->chunk_lock);
3197 
3198  if (Status == STATUS_DISK_FULL) {
3199  Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3200  if (!NT_SUCCESS(Status)) {
3201  ERR("try_consolidation returned %08lx\n", Status);
3202  Vcb->balance.status = Status;
3203  goto end;
// try_consolidation returns success without setting c when the balance is being stopped, so only touch c otherwise
3204  } else if (!Vcb->balance.stopping)
3205  c->balance_num = Vcb->balance.balance_num;
3206 
3207  consolidated = true;
3208 
3209  if (Vcb->balance.stopping)
3210  goto end;
3211  }
3212  }
3213 
3214  if (okay_data_chunks == 0) {
3215  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3216 
3217  Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3218  if (NT_SUCCESS(Status))
3219  c->balance_num = Vcb->balance.balance_num;
3220  else if (Status != STATUS_DISK_FULL || consolidated) {
3221  ERR("alloc_chunk returned %08lx\n", Status);
3222  ExReleaseResourceLite(&Vcb->chunk_lock);
3223  Vcb->balance.status = Status;
3224  goto end;
3225  }
3226 
3227  ExReleaseResourceLite(&Vcb->chunk_lock);
3228 
3229  if (Status == STATUS_DISK_FULL) {
3230  Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3231  if (!NT_SUCCESS(Status)) {
3232  ERR("try_consolidation returned %08lx\n", Status);
3233  Vcb->balance.status = Status;
3234  goto end;
// c is not set by try_consolidation when the balance is being stopped
3235  } else if (!Vcb->balance.stopping)
3236  c->balance_num = Vcb->balance.balance_num;
3237 
3238  consolidated = true;
3239 
3240  if (Vcb->balance.stopping)
3241  goto end;
3242  }
3243  }
3244 
3245  if (okay_system_chunks == 0) {
3246  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3247 
3248  Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3249  if (NT_SUCCESS(Status))
3250  c->balance_num = Vcb->balance.balance_num;
3251  else if (Status != STATUS_DISK_FULL || consolidated) {
3252  ERR("alloc_chunk returned %08lx\n", Status);
3253  ExReleaseResourceLite(&Vcb->chunk_lock);
3254  Vcb->balance.status = Status;
3255  goto end;
3256  }
3257 
3258  ExReleaseResourceLite(&Vcb->chunk_lock);
3259 
3260  if (Status == STATUS_DISK_FULL) {
3261  Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3262  if (!NT_SUCCESS(Status)) {
3263  ERR("try_consolidation returned %08lx\n", Status);
3264  Vcb->balance.status = Status;
3265  goto end;
// c is not set by try_consolidation when the balance is being stopped
3266  } else if (!Vcb->balance.stopping)
3267  c->balance_num = Vcb->balance.balance_num;
3268 
3269  consolidated = true;
3270 
3271  if (Vcb->balance.stopping)
3272  goto end;
3273  }
3274  }
3275  }
3276 
3277  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3278 
// flag every queued chunk as being relocated
3279  le = chunks.Flink;
3280  while (le != &chunks) {
3281  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3282 
3283  c->reloc = true;
3284 
3285  le = le->Flink;
3286  }
3287 
3288  ExReleaseResourceLite(&Vcb->chunk_lock);
3289 
3290  // do data chunks before metadata
3291  le = chunks.Flink;
3292  while (le != &chunks) {
3293  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
// successor saved up front because c may be unlinked from the list below
3294  LIST_ENTRY* le2 = le->Flink;
3295 
3296  if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3297  bool changed;
3298 
3299  do {
3300  changed = false;
3301 
3302  Status = balance_data_chunk(Vcb, c, &changed);
3303  if (!NT_SUCCESS(Status)) {
3304  ERR("balance_data_chunk returned %08lx\n", Status);
3305  Vcb->balance.status = Status;
3306  goto end;
3307  }
3308 
3309  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3310 
3311  if (Vcb->readonly)
3312  Vcb->balance.stopping = true;
3313 
3314  if (Vcb->balance.stopping)
3315  break;
3316  } while (changed);
3317 
3318  c->changed = true;
3319  c->space_changed = true;
3320  }
3321 
3322  if (Vcb->balance.stopping)
3323  goto end;
3324 
// a data chunk stays queued only if it is also a metadata chunk and metadata balancing is enabled
3325  if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3326  (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3327  RemoveEntryList(&c->list_entry_balance);
3328  c->list_entry_balance.Flink = NULL;
3329 
3330  Vcb->balance.chunks_left--;
3331  }
3332 
3333  le = le2;
3334  }
3335 
3336  // do metadata chunks
3337  while (!IsListEmpty(&chunks)) {
3338  chunk* c;
3339  bool changed;
3340 
3341  le = RemoveHeadList(&chunks);
3342  c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3343 
3344  if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3345  do {
3346  Status = balance_metadata_chunk(Vcb, c, &changed);
3347  if (!NT_SUCCESS(Status)) {
3348  ERR("balance_metadata_chunk returned %08lx\n", Status);
3349  Vcb->balance.status = Status;
3350  goto end;
3351  }
3352 
3353  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3354 
3355  if (Vcb->readonly)
3356  Vcb->balance.stopping = true;
3357 
3358  if (Vcb->balance.stopping)
3359  break;
3360  } while (changed);
3361 
3362  c->changed = true;
3363  c->space_changed = true;
3364  }
3365 
3366  if (Vcb->balance.stopping)
3367  break;
3368 
3369  c->list_entry_balance.Flink = NULL;
3370 
3371  Vcb->balance.chunks_left--;
3372  }
3373 
3374 end:
3375  if (!Vcb->readonly) {
// on stop or failure: un-flag the chunks still queued and restore the pre-conversion profile flags
3376  if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3377  le = chunks.Flink;
3378  while (le != &chunks) {
3379  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3380  c->reloc = false;
3381 
3382  le = le->Flink;
3383  c->list_entry_balance.Flink = NULL;
3384  }
3385 
3386  if (old_data_flags != 0)
3387  Vcb->data_flags = old_data_flags;
3388 
3389  if (old_metadata_flags != 0)
3390  Vcb->metadata_flags = old_metadata_flags;
3391 
3392  if (old_system_flags != 0)
3393  Vcb->system_flags = old_system_flags;
3394  }
3395 
3396  if (Vcb->balance.removing) {
3397  device* dev = NULL;
3398 
3399  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3400 
// the device being removed is identified by opts[0].devid
3401  le = Vcb->devices.Flink;
3402  while (le != &Vcb->devices) {
3403  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3404 
3405  if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3406  dev = dev2;
3407  break;
3408  }
3409 
3410  le = le->Flink;
3411  }
3412 
// removal is only completed when every queued chunk was relocated (the call line is not shown here)
3413  if (dev) {
3414  if (Vcb->balance.chunks_left == 0) {
3416 
3417  if (!NT_SUCCESS(Status)) {
3418  ERR("finish_removing_device returned %08lx\n", Status);
3419  dev->reloc = false;
3420  }
3421  } else
3422  dev->reloc = false;
3423  }
3424 
3425  ExReleaseResourceLite(&Vcb->tree_lock);
3426  } else if (Vcb->balance.shrinking) {
3427  device* dev = NULL;
3428 
3429  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3430 
3431  le = Vcb->devices.Flink;
3432  while (le != &Vcb->devices) {
3433  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3434 
3435  if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3436  dev = dev2;
3437  break;
3438  }
3439 
3440  le = le->Flink;
3441  }
3442 
// a missing device sets a failure status, which routes execution into the first branch below
3443  if (!dev) {
3444  ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3445  Vcb->balance.status = STATUS_INTERNAL_ERROR;
3446  }
3447 
3448  if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3449  if (dev) {
3451  if (!NT_SUCCESS(Status))
3452  WARN("regenerate_space_list returned %08lx\n", Status);
3453  }
3454  } else {
3455  uint64_t old_size;
3456 
// the new device size is the start of the drained range
3457  old_size = dev->devitem.num_bytes;
3458  dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3459 
3461  if (!NT_SUCCESS(Status)) {
3462  ERR("update_dev_item returned %08lx\n", Status);
3463  dev->devitem.num_bytes = old_size;
3464  Vcb->balance.status = Status;
3465 
3467  if (!NT_SUCCESS(Status))
3468  WARN("regenerate_space_list returned %08lx\n", Status);
3469  } else {
3470  Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3471 
3472  Status = do_write(Vcb, NULL);
3473  if (!NT_SUCCESS(Status))
3474  ERR("do_write returned %08lx\n", Status);
3475 
3476  free_trees(Vcb);
3477  }
3478  }
3479 
3480  ExReleaseResourceLite(&Vcb->tree_lock);
3481 
// the statement guarded by this condition is not shown in this listing
3482  if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3484  } else {
3486  if (!NT_SUCCESS(Status)) {
3487  ERR("remove_balance_item returned %08lx\n", Status);
// NOTE(review): this jumps back to `end`, so a persistent remove_balance_item failure repeats the clean-up indefinitely
3488  goto end;
3489  }
3490  }
3491 
// finally trim the unallocated space of every writable device with trim set
3492  if (Vcb->trim && !Vcb->options.no_trim) {
3493  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3494 
3495  le = Vcb->devices.Flink;
3496  while (le != &Vcb->devices) {
3497  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3498 
3499  if (dev2->devobj && !dev2->readonly && dev2->trim)
3500  trim_unalloc_space(Vcb, dev2);
3501 
3502  le = le->Flink;
3503  }
3504 
3505  ExReleaseResourceLite(&Vcb->tree_lock);
3506  }
3507  }
3508 
// a NULL thread handle is what the other routines in this file test to see whether a balance is running
3509  ZwClose(Vcb->balance.thread);
3510  Vcb->balance.thread = NULL;
3511 
3512  KeSetEvent(&Vcb->balance.finished, 0, false);
3513 }
3514 
/* Body of the start-balance request handler (it uses Vcb, data, length and processor_mode; the
 * signature line and the line defining bsb are missing from this listing - bsb presumably points at
 * the caller's btrfs_start_balance buffer, TODO confirm).
 * Validates and normalises the caller's per-type options, copies them into Vcb->balance.opts and
 * spawns balance_thread.
 * Returns STATUS_INVALID_PARAMETER for a short buffer or malformed options, STATUS_DEVICE_NOT_READY
 * when the volume is locked or a scrub/balance is already running, otherwise STATUS_SUCCESS or the
 * PsCreateSystemThread failure. Some return statements are absent from this listing. */
3516  NTSTATUS Status;
3518  OBJECT_ATTRIBUTES oa;
3519  uint8_t i;
3520 
3521  if (length < sizeof(btrfs_start_balance) || !data)
3522  return STATUS_INVALID_PARAMETER;
3523 
3524  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3526 
3527  if (Vcb->locked) {
3528  WARN("cannot start balance while locked\n");
3529  return STATUS_DEVICE_NOT_READY;
3530  }
3531 
3532  if (Vcb->scrub.thread) {
3533  WARN("cannot start balance while scrub running\n");
3534  return STATUS_DEVICE_NOT_READY;
3535  }
3536 
3537  if (Vcb->balance.thread) {
3538  WARN("balance already running\n");
3539  return STATUS_DEVICE_NOT_READY;
3540  }
3541 
3542  if (Vcb->readonly)
3544 
3548  return STATUS_SUCCESS;
3549 
// validate each enabled option set (the loop covers the three entries of bsb->opts)
3550  for (i = 0; i < 3; i++) {
3551  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3552  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3556 
3557  if (bsb->opts[i].profiles == 0)
3558  return STATUS_INVALID_PARAMETER;
3559  }
3560 
3561  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3562  if (bsb->opts[i].devid == 0)
3563  return STATUS_INVALID_PARAMETER;
3564  }
3565 
3566  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3567  if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3568  return STATUS_INVALID_PARAMETER;
3569  }
3570 
3571  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3572  if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3573  return STATUS_INVALID_PARAMETER;
3574  }
3575 
// limit and stripe bounds are raised to at least 1 before the range check
3576  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3577  bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3578  bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3579 
3580  if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3581  return STATUS_INVALID_PARAMETER;
3582  }
3583 
3584  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3585  bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3586  bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3587 
3588  if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3589  return STATUS_INVALID_PARAMETER;
3590  }
3591 
// usage bounds are capped at 100 and checked against each other
// (this previously read and compared the stripes_* fields, clobbering the caller's usage range)
3592  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3593  bsb->opts[i].usage_start = min(100, bsb->opts[i].usage_start);
3594  bsb->opts[i].usage_end = min(100, bsb->opts[i].usage_end);
3595 
3596  if (bsb->opts[i].usage_start > bsb->opts[i].usage_end)
3597  return STATUS_INVALID_PARAMETER;
3598  }
3599 
// only the profiles compared here are accepted as conversion targets (part of the list is not shown in this listing)
3600  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3601  if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3603  bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3605  bsb->opts[i].convert != BLOCK_FLAG_RAID1C4)
3606  return STATUS_INVALID_PARAMETER;
3607  }
3608  }
3609  }
3610 
3611  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3614 
3615  Vcb->balance.paused = false;
3616  Vcb->balance.removing = false;
3617  Vcb->balance.shrinking = false;
3618  Vcb->balance.status = STATUS_SUCCESS;
// the event starts signalled because paused is false, so the worker's waits do not block
3619  KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3620 
3622 
3623  Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3624  if (!NT_SUCCESS(Status)) {
3625  ERR("PsCreateSystemThread returned %08lx\n", Status);
3626  return Status;
3627  }
3628 
3629  return STATUS_SUCCESS;
3630 }
3631 
/* Body of the routine that resumes a balance recorded on disk (the signature line is missing from
 * this listing).
 * Looks up the (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0) item in the root tree, loads its per-type
 * arguments into Vcb->balance.opts, applies the resume heuristics and starts balance_thread. The
 * thread starts paused when the volume is read-only or the skip_balance option is set.
 * Returns STATUS_NOT_FOUND when no such item exists, STATUS_INTERNAL_ERROR when the item is shorter
 * than BALANCE_ITEM, a find_item / PsCreateSystemThread failure, or STATUS_SUCCESS. */
3633  KEY searchkey;
3634  traverse_ptr tp;
3635  NTSTATUS Status;
3636  BALANCE_ITEM* bi;
3637  OBJECT_ATTRIBUTES oa;
3638  int i;
3639 
3640  searchkey.obj_id = BALANCE_ITEM_ID;
3641  searchkey.obj_type = TYPE_TEMP_ITEM;
3642  searchkey.offset = 0;
3643 
3644  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3645  if (!NT_SUCCESS(Status)) {
3646  ERR("find_item returned %08lx\n", Status);
3647  return Status;
3648  }
3649 
// a non-zero keycmp result is treated as "the balance item does not exist"
3650  if (keycmp(tp.item->key, searchkey)) {
3651  TRACE("no balance item found\n");
3652  return STATUS_NOT_FOUND;
3653  }
3654 
3655  if (tp.item->size < sizeof(BALANCE_ITEM)) {
3656  WARN("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3657  tp.item->size, sizeof(BALANCE_ITEM));
3658  return STATUS_INTERNAL_ERROR;
3659  }
3660 
3661  bi = (BALANCE_ITEM*)tp.item->data;
3662 
// load the stored arguments for each block-group type flagged in the item
3663  if (bi->flags & BALANCE_FLAGS_DATA)
3664  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3665 
3666  if (bi->flags & BALANCE_FLAGS_METADATA)
3667  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3668 
3669  if (bi->flags & BALANCE_FLAGS_SYSTEM)
3670  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3671 
3672  // do the heuristics that Linux driver does
3673 
3674  for (i = 0; i < 3; i++) {
3675  if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3676  // if converting, don't redo chunks already done
3677 
3678  if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3679  Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3680 
3681  // don't balance chunks more than 90% filled - presumably these
3682  // have already been done
3683 
// applied only when neither a usage filter nor a conversion was requested
3684  if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3685  !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3686  ) {
3687  Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3688  Vcb->balance.opts[i].usage_start = 0;
3689  Vcb->balance.opts[i].usage_end = 90;
3690  }
3691  }
3692  }
3693 
3694  if (Vcb->readonly || Vcb->options.skip_balance)
3695  Vcb->balance.paused = true;
3696  else
3697  Vcb->balance.paused = false;
3698 
3699  Vcb->balance.removing = false;
3700  Vcb->balance.shrinking = false;
3701  Vcb->balance.status = STATUS_SUCCESS;
// the event's initial state is the inverse of paused, so a paused balance blocks at the worker's first wait
3702  KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3703 
3705 
3706  Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3707  if (!NT_SUCCESS(Status)) {
3708  ERR("PsCreateSystemThread returned %08lx\n", Status);
3709  return Status;
3710  }
3711 
3712  return STATUS_SUCCESS;
3713 }
3714 
3717 
/* Body of the balance-status query handler (the signature line and the line defining bqb are missing
 * from this listing; bqb presumably points at the caller's btrfs_query_balance buffer in `data` -
 * TODO confirm).
 * Fills bqb with the current balance state, progress counters, last status and the three option sets.
 * Returns STATUS_INVALID_PARAMETER for a missing or short buffer, otherwise STATUS_SUCCESS. */
3718  if (length < sizeof(btrfs_query_balance) || !data)
3719  return STATUS_INVALID_PARAMETER;
3720 
// no worker thread: only the error flag and code are reported (the initial status assignment is not shown in this listing)
3721  if (!Vcb->balance.thread) {
3723 
3724  if (!NT_SUCCESS(Vcb->balance.status)) {
3725  bqb->status |= BTRFS_BALANCE_ERROR;
3726  bqb->error = Vcb->balance.status;
3727  }
3728 
3729  return STATUS_SUCCESS;
3730  }
3731 
3732  bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3733 
3734  if (Vcb->balance.removing)
3735  bqb->status |= BTRFS_BALANCE_REMOVAL;
3736 
// the statement guarded by this condition is not shown in this listing
3737  if (Vcb->balance.shrinking)
3739 
3740  if (!NT_SUCCESS(Vcb->balance.status))
3741  bqb->status |= BTRFS_BALANCE_ERROR;
3742 
3743  bqb->chunks_left = Vcb->balance.chunks_left;
3744  bqb->total_chunks = Vcb->balance.total_chunks;
3745  bqb->error = Vcb->balance.status;
3746  RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3747  RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3748  RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3749 
3750  return STATUS_SUCCESS;
3751 }
3752 
/* Body of the pause-balance handler (the signature line and the statement following the privilege
 * check are missing from this listing).
 * Marks the balance as paused and clears the balance event, on which the worker thread waits between steps.
 * Returns STATUS_DEVICE_NOT_READY when no balance is running or it is already paused. */
3754  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3756 
3757  if (!Vcb->balance.thread)
3758  return STATUS_DEVICE_NOT_READY;
3759 
3760  if (Vcb->balance.paused)
3761  return STATUS_DEVICE_NOT_READY;
3762 
3763  Vcb->balance.paused = true;
3764  KeClearEvent(&Vcb->balance.event);
3765 
3766  return STATUS_SUCCESS;
3767 }
3768 
/* Body of the balance resume routine: clears the paused flag and signals the
 * balance event.
 *
 * NOTE(review): the function signature (listing line 3769) is missing from
 * this extract; the body reads `Vcb` and `processor_mode`.
 *
 * Returns STATUS_DEVICE_NOT_READY when there is no balance thread handle or
 * the balance is not currently paused; STATUS_SUCCESS once resumed. */
// The caller must hold SE_MANAGE_VOLUME_PRIVILEGE for the given processor mode.
3770  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
// NOTE(review): listing line 3771 (the statement run when the privilege check
// fails) is missing from this extract; presumably it returns an error status - confirm.
3772 
3773  if (!Vcb->balance.thread)
3774  return STATUS_DEVICE_NOT_READY;
3775 
3776  if (!Vcb->balance.paused)
3777  return STATUS_DEVICE_NOT_READY;
3778 
3779  if (Vcb->readonly)
// NOTE(review): listing line 3780 (the body of the read-only check above) is
// missing from this extract; presumably it refuses to resume on a read-only
// volume by returning an error status - confirm.
3781 
// Clear the paused flag, then set the event to signalled (priority increment 0,
// Wait = false); presumably this releases the waiting balance thread - verify.
3782  Vcb->balance.paused = false;
3783  KeSetEvent(&Vcb->balance.event, 0, false);
3784 
3785  return STATUS_SUCCESS;
3786 }
3787 
/* Body of the balance stop routine: flags the balance as stopping and signals
 * the balance event.
 *
 * NOTE(review): the function signature (listing line 3788) is missing from
 * this extract; the body reads `Vcb` and `processor_mode`.
 *
 * Returns STATUS_DEVICE_NOT_READY when there is no balance thread handle;
 * STATUS_SUCCESS once the stop request has been recorded. Nothing here waits
 * for the thread to exit. */
// The caller must hold SE_MANAGE_VOLUME_PRIVILEGE for the given processor mode.
3789  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
// NOTE(review): listing line 3790 (the statement run when the privilege check
// fails) is missing from this extract; presumably it returns an error status - confirm.
3791 
3792  if (!Vcb->balance.thread)
3793  return STATUS_DEVICE_NOT_READY;
3794 
// Unlike the pause/resume bodies, the current paused state is not checked:
// the flag is cleared unconditionally and the stored status is reset to success.
3795  Vcb->balance.paused = false;
3796  Vcb->balance.stopping = true;
3797  Vcb->balance.status = STATUS_SUCCESS;
// Signal the event; presumably so a paused balance thread wakes up and can
// observe `stopping` - verify against the thread routine.
3798  KeSetEvent(&Vcb->balance.event, 0, false);
3799 
3800  return STATUS_SUCCESS;
3801 }
3802 
3804  uint64_t devid;
3805  LIST_ENTRY* le;
3806  device* dev = NULL;
3807  NTSTATUS Status;
3808  int i;
3809  uint64_t num_rw_devices;
3810  OBJECT_ATTRIBUTES oa;
3811 
3812  TRACE("(%p, %p, %lx)\n", Vcb, data, length);
3813 
3814  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3816 
3817  if (length < sizeof(uint64_t))
3818  return STATUS_INVALID_PARAMETER;
3819 
3820  devid = *(uint64_t*)data;
3821 
3822  ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3823 
3824  if (Vcb->readonly) {
3825  ExReleaseResourceLite(&Vcb->tree_lock);
3827  }
3828 
3829  num_rw_devices = 0;
3830 
3831  le = Vcb->devices.Flink;
3832  while (le != &Vcb->devices) {
3833  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3834 
3835  if (dev2->devitem.dev_id == devid)
3836  dev = dev2;
3837 
3838  if (!dev2->readonly)
3839  num_rw_devices++;
3840 
3841  le = le->Flink;
3842  }
3843 
3844  if (!dev) {
3845  ExReleaseResourceLite(&Vcb->tree_lock);
3846  WARN("device %I64x not found\n", devid);
3847  return STATUS_NOT_FOUND;
3848  }
3849 
3850  if (!dev->readonly) {
3851  if (num_rw_devices == 1) {
3852  ExReleaseResourceLite(&Vcb->tree_lock);
3853  WARN("not removing last non-readonly device\n");
3854  return STATUS_INVALID_PARAMETER;
3855  }
3856 
3857  if (num_rw_devices == 4 &&
3858  ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3859  (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6) ||
3860  (Vcb->data_flags & BLOCK_FLAG_RAID1C4 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C4 || Vcb->system_flags & BLOCK_FLAG_RAID1C4)
3861  )
3862  ) {
3863  ExReleaseResourceLite(&Vcb->tree_lock);
3864  ERR("would not be enough devices to satisfy RAID requirement (RAID6/10/1C4)\n");
3865  return STATUS_CANNOT_DELETE;
3866  }
3867 
3868  if (num_rw_devices == 3 &&
3869  ((Vcb->data_flags & BLOCK_FLAG_RAID5 |