ReactOS  0.4.14-dev-77-gd9e7c48
balance.c
Go to the documentation of this file.
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21 
22 typedef struct {
27  tree* t;
28  bool system;
32 
33 typedef struct {
36 
37  union {
40  };
41 
43  bool top;
46 
47 typedef struct {
55 } data_reloc;
56 
57 typedef struct {
60 
61  union {
64  };
65 
69 
70 #ifndef _MSC_VER // not in mingw yet
71 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
72 #endif
73 
74 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
75 
77  bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
79  metadata_reloc* mr;
80  EXTENT_ITEM* ei;
81  uint16_t len;
82  uint64_t inline_rc;
83  uint8_t* ptr;
84 
86  if (!mr) {
87  ERR("out of memory\n");
89  }
90 
91  mr->address = tp->item->key.obj_id;
92  mr->data = NULL;
93  mr->ei = (EXTENT_ITEM*)tp->item->data;
94  mr->system = false;
96 
98  if (!NT_SUCCESS(Status)) {
99  ERR("delete_tree_item returned %08x\n", Status);
100  ExFreePool(mr);
101  return Status;
102  }
103 
104  if (!c)
106 
107  if (c) {
109 
110  c->used -= Vcb->superblock.node_size;
111 
112  space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
113 
115  }
116 
117  ei = (EXTENT_ITEM*)tp->item->data;
118  inline_rc = 0;
119 
120  len = tp->item->size - sizeof(EXTENT_ITEM);
121  ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
122  if (!skinny) {
123  len -= sizeof(EXTENT_ITEM2);
124  ptr += sizeof(EXTENT_ITEM2);
125  }
126 
127  while (len > 0) {
128  uint8_t secttype = *ptr;
129  uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
131 
132  len--;
133 
134  if (sectlen > len) {
135  ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
136  return STATUS_INTERNAL_ERROR;
137  }
138 
139  if (sectlen == 0) {
140  ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
141  return STATUS_INTERNAL_ERROR;
142  }
143 
145  if (!ref) {
146  ERR("out of memory\n");
148  }
149 
150  if (secttype == TYPE_TREE_BLOCK_REF) {
151  ref->type = TYPE_TREE_BLOCK_REF;
152  RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
153  inline_rc++;
154  } else if (secttype == TYPE_SHARED_BLOCK_REF) {
155  ref->type = TYPE_SHARED_BLOCK_REF;
156  RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
157  inline_rc++;
158  } else {
159  ERR("unexpected tree type %x\n", secttype);
160  ExFreePool(ref);
161  return STATUS_INTERNAL_ERROR;
162  }
163 
164  ref->parent = NULL;
165  ref->top = false;
167 
168  len -= sectlen;
169  ptr += sizeof(uint8_t) + sectlen;
170  }
171 
172  if (inline_rc < ei->refcount) { // look for non-inline entries
173  traverse_ptr tp2 = *tp, next_tp;
174 
175  while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
176  tp2 = next_tp;
177 
178  if (tp2.item->key.obj_id == tp->item->key.obj_id) {
179  if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
181  if (!ref) {
182  ERR("out of memory\n");
184  }
185 
186  ref->type = TYPE_TREE_BLOCK_REF;
187  ref->tbr.offset = tp2.item->key.offset;
188  ref->parent = NULL;
189  ref->top = false;
191 
192  Status = delete_tree_item(Vcb, &tp2);
193  if (!NT_SUCCESS(Status)) {
194  ERR("delete_tree_item returned %08x\n", Status);
195  return Status;
196  }
197  } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
199  if (!ref) {
200  ERR("out of memory\n");
202  }
203 
204  ref->type = TYPE_SHARED_BLOCK_REF;
205  ref->sbr.offset = tp2.item->key.offset;
206  ref->parent = NULL;
207  ref->top = false;
209 
210  Status = delete_tree_item(Vcb, &tp2);
211  if (!NT_SUCCESS(Status)) {
212  ERR("delete_tree_item returned %08x\n", Status);
213  return Status;
214  }
215  }
216  } else
217  break;
218  }
219  }
220 
222 
223  if (mr2)
224  *mr2 = mr;
225 
226  return STATUS_SUCCESS;
227 }
228 
231  LIST_ENTRY* le;
232  KEY searchkey;
234  bool skinny = false;
236 
237  le = items->Flink;
238  while (le != items) {
240 
241  if (mr->address == address) {
242  *mr2 = mr;
243  return STATUS_SUCCESS;
244  }
245 
246  le = le->Flink;
247  }
248 
249  searchkey.obj_id = address;
250  searchkey.obj_type = TYPE_METADATA_ITEM;
251  searchkey.offset = 0xffffffffffffffff;
252 
253  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
254  if (!NT_SUCCESS(Status)) {
255  ERR("find_item returned %08x\n", Status);
256  return Status;
257  }
258 
260  skinny = true;
261  else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
262  tp.item->size >= sizeof(EXTENT_ITEM)) {
264 
265  if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
266  ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
267  return STATUS_INTERNAL_ERROR;
268  }
269  } else {
270  ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
271  return STATUS_INTERNAL_ERROR;
272  }
273 
274  Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
275  if (!NT_SUCCESS(Status)) {
276  ERR("add_metadata_reloc returned %08x\n", Status);
277  return Status;
278  }
279 
280  return STATUS_SUCCESS;
281 }
282 
284  LIST_ENTRY newlist, *le;
285 
286  if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
287  return;
288 
289  // insertion sort
290 
291  InitializeListHead(&newlist);
292 
293  while (!IsListEmpty(&mr->refs)) {
295  bool inserted = false;
296 
297  if (ref->type == TYPE_TREE_BLOCK_REF)
298  ref->hash = ref->tbr.offset;
299  else if (ref->type == TYPE_SHARED_BLOCK_REF)
300  ref->hash = ref->parent->new_address;
301 
302  le = newlist.Flink;
303  while (le != &newlist) {
305 
306  if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
308  inserted = true;
309  break;
310  }
311 
312  le = le->Flink;
313  }
314 
315  if (!inserted)
316  InsertTailList(&newlist, &ref->list_entry);
317  }
318 
319  newlist.Flink->Blink = &mr->refs;
320  newlist.Blink->Flink = &mr->refs;
321  mr->refs.Flink = newlist.Flink;
322  mr->refs.Blink = newlist.Blink;
323 }
324 
327  LIST_ENTRY* le;
328  uint64_t rc = 0;
329  uint16_t inline_len;
330  bool all_inline = true;
331  metadata_reloc_ref* first_noninline = NULL;
332  EXTENT_ITEM* ei;
333  uint8_t* ptr;
334 
335  inline_len = sizeof(EXTENT_ITEM);
336  if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
337  inline_len += sizeof(EXTENT_ITEM2);
338 
340 
341  le = mr->refs.Flink;
342  while (le != &mr->refs) {
344  uint16_t extlen = 0;
345 
346  rc++;
347 
348  if (ref->type == TYPE_TREE_BLOCK_REF)
349  extlen += sizeof(TREE_BLOCK_REF);
350  else if (ref->type == TYPE_SHARED_BLOCK_REF)
351  extlen += sizeof(SHARED_BLOCK_REF);
352 
353  if (all_inline) {
354  if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
355  all_inline = false;
356  first_noninline = ref;
357  } else
358  inline_len += extlen + 1;
359  }
360 
361  le = le->Flink;
362  }
363 
364  ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
365  if (!ei) {
366  ERR("out of memory\n");
368  }
369 
370  ei->refcount = rc;
371  ei->generation = mr->ei->generation;
372  ei->flags = mr->ei->flags;
373  ptr = (uint8_t*)&ei[1];
374 
375  if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
376  EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
377 
378  ei2->firstitem = *(KEY*)&mr->data[1];
379  ei2->level = mr->data->level;
380 
381  ptr += sizeof(EXTENT_ITEM2);
382  }
383 
384  le = mr->refs.Flink;
385  while (le != &mr->refs) {
387 
388  if (ref == first_noninline)
389  break;
390 
391  *ptr = ref->type;
392  ptr++;
393 
394  if (ref->type == TYPE_TREE_BLOCK_REF) {
396 
397  tbr->offset = ref->tbr.offset;
398 
399  ptr += sizeof(TREE_BLOCK_REF);
400  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
402 
403  sbr->offset = ref->parent->new_address;
404 
405  ptr += sizeof(SHARED_BLOCK_REF);
406  }
407 
408  le = le->Flink;
409  }
410 
411  if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
412  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
413  else
414  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
415 
416  if (!NT_SUCCESS(Status)) {
417  ERR("insert_tree_item returned %08x\n", Status);
418  ExFreePool(ei);
419  return Status;
420  }
421 
422  if (!all_inline) {
423  le = &first_noninline->list_entry;
424 
425  while (le != &mr->refs) {
427 
428  if (ref->type == TYPE_TREE_BLOCK_REF) {
429  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
430  if (!NT_SUCCESS(Status)) {
431  ERR("insert_tree_item returned %08x\n", Status);
432  return Status;
433  }
434  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
435  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
436  if (!NT_SUCCESS(Status)) {
437  ERR("insert_tree_item returned %08x\n", Status);
438  return Status;
439  }
440  }
441 
442  le = le->Flink;
443  }
444  }
445 
447  if (mr->data->level > 0) {
448  uint16_t i;
449  internal_node* in = (internal_node*)&mr->data[1];
450 
451  for (i = 0; i < mr->data->num_items; i++) {
453 
454  if (sbrrc > 0) {
455  SHARED_BLOCK_REF sbr;
456 
457  sbr.offset = mr->new_address;
458 
459  Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
460  if (!NT_SUCCESS(Status)) {
461  ERR("increase_extent_refcount returned %08x\n", Status);
462  return Status;
463  }
464 
465  sbr.offset = mr->address;
466 
467  Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
468  sbr.offset, false, NULL);
469  if (!NT_SUCCESS(Status)) {
470  ERR("decrease_extent_refcount returned %08x\n", Status);
471  return Status;
472  }
473  }
474  }
475  } else {
476  uint16_t i;
477  leaf_node* ln = (leaf_node*)&mr->data[1];
478 
479  for (i = 0; i < mr->data->num_items; i++) {
480  if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
481  EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
482 
485 
486  if (ed2->size > 0) { // not sparse
488 
489  if (sdrrc > 0) {
490  SHARED_DATA_REF sdr;
491  chunk* c;
492 
493  sdr.offset = mr->new_address;
494  sdr.count = sdrrc;
495 
497  if (!NT_SUCCESS(Status)) {
498  ERR("increase_extent_refcount returned %08x\n", Status);
499  return Status;
500  }
501 
502  sdr.offset = mr->address;
503 
505  sdr.offset, false, NULL);
506  if (!NT_SUCCESS(Status)) {
507  ERR("decrease_extent_refcount returned %08x\n", Status);
508  return Status;
509  }
510 
512 
513  if (c) {
514  // check changed_extents
515 
516  ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
517 
518  le = c->changed_extents.Flink;
519 
520  while (le != &c->changed_extents) {
522 
523  if (ce->address == ed2->address) {
524  LIST_ENTRY* le2;
525 
526  le2 = ce->refs.Flink;
527  while (le2 != &ce->refs) {
529 
530  if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
531  cer->sdr.offset = mr->new_address;
532  break;
533  }
534 
535  le2 = le2->Flink;
536  }
537 
538  le2 = ce->old_refs.Flink;
539  while (le2 != &ce->old_refs) {
541 
542  if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
543  cer->sdr.offset = mr->new_address;
544  break;
545  }
546 
547  le2 = le2->Flink;
548  }
549 
550  break;
551  }
552 
553  le = le->Flink;
554  }
555 
556  ExReleaseResourceLite(&c->changed_extents_lock);
557  }
558  }
559  }
560  }
561  }
562  }
563  }
564  }
565 
566  return STATUS_SUCCESS;
567 }
568 
570  LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
571  LIST_ENTRY tree_writes, *le;
574  uint8_t level, max_level = 0;
575  chunk* newchunk = NULL;
576 
577  InitializeListHead(&tree_writes);
578 
579  le = items->Flink;
580  while (le != items) {
582  LIST_ENTRY* le2;
583  chunk* pc;
584 
585  mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
586  if (!mr->data) {
587  ERR("out of memory\n");
589  }
590 
591  Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
592  c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
593  if (!NT_SUCCESS(Status)) {
594  ERR("read_data returned %08x\n", Status);
595  return Status;
596  }
597 
598  if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
599  mr->system = true;
600 
601  if (data_items && mr->data->level == 0) {
602  le2 = data_items->Flink;
603  while (le2 != data_items) {
605  leaf_node* ln = (leaf_node*)&mr->data[1];
606  uint16_t i;
607 
608  for (i = 0; i < mr->data->num_items; i++) {
609  if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
610  EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
611 
614 
615  if (ed2->address == dr->address)
616  ed2->address = dr->new_address;
617  }
618  }
619  }
620 
621  le2 = le2->Flink;
622  }
623  }
624 
625  if (mr->data->level > max_level)
626  max_level = mr->data->level;
627 
628  le2 = mr->refs.Flink;
629  while (le2 != &mr->refs) {
631 
632  if (ref->type == TYPE_TREE_BLOCK_REF) {
633  KEY* firstitem;
634  root* r = NULL;
635  LIST_ENTRY* le3;
636  tree* t;
637 
638  firstitem = (KEY*)&mr->data[1];
639 
640  le3 = Vcb->roots.Flink;
641  while (le3 != &Vcb->roots) {
643 
644  if (r2->id == ref->tbr.offset) {
645  r = r2;
646  break;
647  }
648 
649  le3 = le3->Flink;
650  }
651 
652  if (!r) {
653  ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
654  return STATUS_INTERNAL_ERROR;
655  }
656 
657  Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
659  ERR("find_item_to_level returned %08x\n", Status);
660  return Status;
661  }
662 
663  t = tp.tree;
664  while (t && t->header.level < mr->data->level + 1) {
665  t = t->parent;
666  }
667 
668  if (!t)
669  ref->top = true;
670  else {
671  metadata_reloc* mr2;
672 
673  Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
674  if (!NT_SUCCESS(Status)) {
675  ERR("add_metadata_reloc_parent returned %08x\n", Status);
676  return Status;
677  }
678 
679  ref->parent = mr2;
680  }
681  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
682  metadata_reloc* mr2;
683 
684  Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
685  if (!NT_SUCCESS(Status)) {
686  ERR("add_metadata_reloc_parent returned %08x\n", Status);
687  return Status;
688  }
689 
690  ref->parent = mr2;
691  }
692 
693  le2 = le2->Flink;
694  }
695 
696  le = le->Flink;
697  }
698 
699  le = items->Flink;
700  while (le != items) {
702  LIST_ENTRY* le2;
703  uint32_t hash;
704 
705  mr->t = NULL;
706 
707  hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
708 
709  le2 = Vcb->trees_ptrs[hash >> 24];
710 
711  if (le2) {
712  while (le2 != &Vcb->trees_hash) {
713  tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
714 
715  if (t->header.address == mr->address) {
716  mr->t = t;
717  break;
718  } else if (t->hash > hash)
719  break;
720 
721  le2 = le2->Flink;
722  }
723  }
724 
725  le = le->Flink;
726  }
727 
728  for (level = 0; level <= max_level; level++) {
729  le = items->Flink;
730  while (le != items) {
732 
733  if (mr->data->level == level) {
734  bool done = false;
735  LIST_ENTRY* le2;
736  tree_write* tw;
737  uint64_t flags;
738  tree* t3;
739 
740  if (mr->system)
741  flags = Vcb->system_flags;
742  else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
743  flags = Vcb->data_flags;
744  else
745  flags = Vcb->metadata_flags;
746 
747  if (newchunk) {
748  acquire_chunk_lock(newchunk, Vcb);
749 
750  if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
751  newchunk->used += Vcb->superblock.node_size;
752  space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
753  done = true;
754  }
755 
756  release_chunk_lock(newchunk, Vcb);
757  }
758 
759  if (!done) {
760  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
761 
762  le2 = Vcb->chunks.Flink;
763  while (le2 != &Vcb->chunks) {
765 
766  if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
767  acquire_chunk_lock(c2, Vcb);
768 
769  if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
771  c2->used += Vcb->superblock.node_size;
772  space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback);
773  release_chunk_lock(c2, Vcb);
774  newchunk = c2;
775  done = true;
776  break;
777  }
778  }
779 
780  release_chunk_lock(c2, Vcb);
781  }
782 
783  le2 = le2->Flink;
784  }
785 
786  // allocate new chunk if necessary
787  if (!done) {
788  Status = alloc_chunk(Vcb, flags, &newchunk, false);
789 
790  if (!NT_SUCCESS(Status)) {
791  ERR("alloc_chunk returned %08x\n", Status);
792  ExReleaseResourceLite(&Vcb->chunk_lock);
793  goto end;
794  }
795 
796  acquire_chunk_lock(newchunk, Vcb);
797 
798  newchunk->balance_num = Vcb->balance.balance_num;
799 
800  if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
801  release_chunk_lock(newchunk, Vcb);
802  ExReleaseResourceLite(&Vcb->chunk_lock);
803  ERR("could not find address in new chunk\n");
805  goto end;
806  } else {
807  newchunk->used += Vcb->superblock.node_size;
808  space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
809  }
810 
811  release_chunk_lock(newchunk, Vcb);
812  }
813 
814  ExReleaseResourceLite(&Vcb->chunk_lock);
815  }
816 
817  // update parents
818  le2 = mr->refs.Flink;
819  while (le2 != &mr->refs) {
821 
822  if (ref->parent) {
823  uint16_t i;
824  internal_node* in = (internal_node*)&ref->parent->data[1];
825 
826  for (i = 0; i < ref->parent->data->num_items; i++) {
827  if (in[i].address == mr->address) {
828  in[i].address = mr->new_address;
829  break;
830  }
831  }
832 
833  if (ref->parent->t) {
834  LIST_ENTRY* le3;
835 
836  le3 = ref->parent->t->itemlist.Flink;
837  while (le3 != &ref->parent->t->itemlist) {
839 
840  if (!td->inserted && td->treeholder.address == mr->address)
841  td->treeholder.address = mr->new_address;
842 
843  le3 = le3->Flink;
844  }
845  }
846  } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
847  LIST_ENTRY* le3;
848  root* r = NULL;
849 
850  // alter ROOT_ITEM
851 
852  le3 = Vcb->roots.Flink;
853  while (le3 != &Vcb->roots) {
855 
856  if (r2->id == ref->tbr.offset) {
857  r = r2;
858  break;
859  }
860 
861  le3 = le3->Flink;
862  }
863 
864  if (r) {
865  r->treeholder.address = mr->new_address;
866 
867  if (r == Vcb->root_root)
868  Vcb->superblock.root_tree_addr = mr->new_address;
869  else if (r == Vcb->chunk_root)
870  Vcb->superblock.chunk_tree_addr = mr->new_address;
871  else if (r->root_item.block_number == mr->address) {
872  KEY searchkey;
873  ROOT_ITEM* ri;
874 
875  r->root_item.block_number = mr->new_address;
876 
877  searchkey.obj_id = r->id;
878  searchkey.obj_type = TYPE_ROOT_ITEM;
879  searchkey.offset = 0xffffffffffffffff;
880 
881  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
882  if (!NT_SUCCESS(Status)) {
883  ERR("find_item returned %08x\n", Status);
884  goto end;
885  }
886 
887  if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
888  ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
890  goto end;
891  }
892 
894  if (!ri) {
895  ERR("out of memory\n");
897  goto end;
898  }
899 
900  RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
901 
903  if (!NT_SUCCESS(Status)) {
904  ERR("delete_tree_item returned %08x\n", Status);
905  goto end;
906  }
907 
908  Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
909  if (!NT_SUCCESS(Status)) {
910  ERR("insert_tree_item returned %08x\n", Status);
911  goto end;
912  }
913  }
914  }
915  }
916 
917  le2 = le2->Flink;
918  }
919 
920  mr->data->address = mr->new_address;
921 
922  t3 = mr->t;
923 
924  while (t3) {
925  uint8_t h;
926  bool inserted;
927  tree* t4 = NULL;
928 
929  // check if tree loaded more than once: peek at the next entry of the hash list, unless t3 is its last entry
930  if (t3->list_entry_hash.Flink != &Vcb->trees_hash) { // trees_hash is linked through list_entry_hash, so test that link against the list head
931  tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
932 
933  if (nt->header.address == t3->header.address)
934  t4 = nt;
935  }
936 
937  t3->header.address = mr->new_address;
938 
939  h = t3->hash >> 24;
940 
941  if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
942  if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
943  Vcb->trees_ptrs[h] = NULL;
944  else {
945  tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
946 
947  if (t2->hash >> 24 == h)
948  Vcb->trees_ptrs[h] = &t2->list_entry_hash;
949  else
950  Vcb->trees_ptrs[h] = NULL;
951  }
952  }
953 
955 
956  t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
957  h = t3->hash >> 24;
958 
959  if (!Vcb->trees_ptrs[h]) {
960  uint8_t h2 = h;
961 
962  le2 = Vcb->trees_hash.Flink;
963 
964  if (h2 > 0) {
965  h2--;
966  do {
967  if (Vcb->trees_ptrs[h2]) {
968  le2 = Vcb->trees_ptrs[h2];
969  break;
970  }
971 
972  h2--;
973  } while (h2 > 0);
974  }
975  } else
976  le2 = Vcb->trees_ptrs[h];
977 
978  inserted = false;
979  while (le2 != &Vcb->trees_hash) {
980  tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
981 
982  if (t2->hash >= t3->hash) {
984  inserted = true;
985  break;
986  }
987 
988  le2 = le2->Flink;
989  }
990 
991  if (!inserted)
992  InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
993 
994  if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
995  Vcb->trees_ptrs[h] = &t3->list_entry_hash;
996 
997  if (data_items && level == 0) {
998  le2 = data_items->Flink;
999 
1000  while (le2 != data_items) {
1002  LIST_ENTRY* le3 = t3->itemlist.Flink;
1003 
1004  while (le3 != &t3->itemlist) {
1006 
1007  if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1008  EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1009 
1012 
1013  if (ed2->address == dr->address)
1014  ed2->address = dr->new_address;
1015  }
1016  }
1017 
1018  le3 = le3->Flink;
1019  }
1020 
1021  le2 = le2->Flink;
1022  }
1023  }
1024 
1025  t3 = t4;
1026  }
1027 
1028  *((uint32_t*)mr->data) = ~calc_crc32c(0xffffffff, (uint8_t*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1029 
1031  if (!tw) {
1032  ERR("out of memory\n");
1034  goto end;
1035  }
1036 
1037  tw->address = mr->new_address;
1038  tw->length = Vcb->superblock.node_size;
1039  tw->data = (uint8_t*)mr->data;
1040 
1041  if (IsListEmpty(&tree_writes))
1042  InsertTailList(&tree_writes, &tw->list_entry);
1043  else {
1044  bool inserted = false;
1045 
1046  le2 = tree_writes.Flink;
1047  while (le2 != &tree_writes) {
1049 
1050  if (tw2->address > tw->address) {
1051  InsertHeadList(le2->Blink, &tw->list_entry);
1052  inserted = true;
1053  break;
1054  }
1055 
1056  le2 = le2->Flink;
1057  }
1058 
1059  if (!inserted)
1060  InsertTailList(&tree_writes, &tw->list_entry);
1061  }
1062  }
1063 
1064  le = le->Flink;
1065  }
1066  }
1067 
1068  Status = do_tree_writes(Vcb, &tree_writes, true);
1069  if (!NT_SUCCESS(Status)) {
1070  ERR("do_tree_writes returned %08x\n", Status);
1071  goto end;
1072  }
1073 
1074  le = items->Flink;
1075  while (le != items) {
1077 
1079  if (!NT_SUCCESS(Status)) {
1080  ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1081  goto end;
1082  }
1083 
1084  le = le->Flink;
1085  }
1086 
1088 
1089 end:
1090  while (!IsListEmpty(&tree_writes)) {
1092  ExFreePool(tw);
1093  }
1094 
1095  return Status;
1096 }
1097 
1099  KEY searchkey;
1100  traverse_ptr tp;
1101  NTSTATUS Status;
1102  bool b;
1104  uint32_t loaded = 0;
1105 
1106  TRACE("chunk %I64x\n", c->offset);
1107 
1110 
1111  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1112 
1113  searchkey.obj_id = c->offset;
1114  searchkey.obj_type = TYPE_METADATA_ITEM;
1115  searchkey.offset = 0xffffffffffffffff;
1116 
1117  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1118  if (!NT_SUCCESS(Status)) {
1119  ERR("find_item returned %08x\n", Status);
1120  goto end;
1121  }
1122 
1123  do {
1124  traverse_ptr next_tp;
1125 
1126  if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1127  break;
1128 
1129  if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1130  bool tree = false, skinny = false;
1131 
1132  if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1133  tree = true;
1134  skinny = true;
1135  } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1136  tp.item->size >= sizeof(EXTENT_ITEM)) {
1137  EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1138 
1140  tree = true;
1141  }
1142 
1143  if (tree) {
1144  Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1145 
1146  if (!NT_SUCCESS(Status)) {
1147  ERR("add_metadata_reloc returned %08x\n", Status);
1148  goto end;
1149  }
1150 
1151  loaded++;
1152 
1153  if (loaded >= 64) // only do 64 at a time
1154  break;
1155  }
1156  }
1157 
1158  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1159 
1160  if (b)
1161  tp = next_tp;
1162  } while (b);
1163 
1164  if (IsListEmpty(&items)) {
1165  *changed = false;
1167  goto end;
1168  } else
1169  *changed = true;
1170 
1172  if (!NT_SUCCESS(Status)) {
1173  ERR("write_metadata_items returned %08x\n", Status);
1174  goto end;
1175  }
1176 
1178 
1179  Vcb->need_write = true;
1180 
1181 end:
1182  if (NT_SUCCESS(Status)) {
1183  Status = do_write(Vcb, NULL);
1184  if (!NT_SUCCESS(Status))
1185  ERR("do_write returned %08x\n", Status);
1186  }
1187 
1188  if (NT_SUCCESS(Status))
1190  else
1192 
1193  free_trees(Vcb);
1194 
1195  ExReleaseResourceLite(&Vcb->tree_lock);
1196 
1197  while (!IsListEmpty(&items)) {
1199 
1200  while (!IsListEmpty(&mr->refs)) {
1202 
1203  ExFreePool(ref);
1204  }
1205 
1206  ExFreePool(mr);
1207  }
1208 
1209  return Status;
1210 }
1211 
1214  NTSTATUS Status;
1215  LIST_ENTRY* le;
1216  KEY searchkey;
1217  traverse_ptr tp;
1218  root* r = NULL;
1219  metadata_reloc* mr;
1220  uint64_t last_tree = 0;
1222 
1223  le = Vcb->roots.Flink;
1224  while (le != &Vcb->roots) {
1226 
1227  if (r2->id == edr->root) {
1228  r = r2;
1229  break;
1230  }
1231 
1232  le = le->Flink;
1233  }
1234 
1235  if (!r) {
1236  ERR("could not find subvol %I64x\n", edr->root); // report the root id that the lookup above searched for, not the ref count
1237  return STATUS_INTERNAL_ERROR;
1238  }
1239 
1240  searchkey.obj_id = edr->objid;
1241  searchkey.obj_type = TYPE_EXTENT_DATA;
1242  searchkey.offset = 0;
1243 
1244  Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1245  if (!NT_SUCCESS(Status)) {
1246  ERR("find_item returned %08x\n", Status);
1247  return Status;
1248  }
1249 
1250  if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1251  traverse_ptr tp2;
1252 
1253  if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1254  tp = tp2;
1255  else {
1256  ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1257  return STATUS_INTERNAL_ERROR;
1258  }
1259  }
1260 
1261  ref = NULL;
1262 
1263  while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1264  traverse_ptr tp2;
1265 
1266  if (tp.item->size >= sizeof(EXTENT_DATA)) {
1268 
1271 
1272  if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1273  if (ref && last_tree == tp.tree->header.address)
1274  ref->edr.count++;
1275  else {
1277  if (!ref) {
1278  ERR("out of memory\n");
1280  }
1281 
1282  ref->type = TYPE_EXTENT_DATA_REF;
1283  RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1284  ref->edr.count = 1;
1285 
1286  Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1287  if (!NT_SUCCESS(Status)) {
1288  ERR("add_metadata_reloc_parent returned %08x\n", Status);
1289  ExFreePool(ref);
1290  return Status;
1291  }
1292 
1293  last_tree = tp.tree->header.address;
1294  ref->parent = mr;
1295 
1296  InsertTailList(&dr->refs, &ref->list_entry);
1297  }
1298  }
1299  }
1300  }
1301 
1302  if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1303  tp = tp2;
1304  else
1305  break;
1306  }
1307 
1308  return STATUS_SUCCESS;
1309 }
1310 
// add_data_reloc - the signature line is missing from this listing; the name and
// argument order are taken from the caller, which invokes
// add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback).
//
// Describes the data extent at *tp as a data_reloc: records its address
// (key.obj_id), size (key.offset) and EXTENT_ITEM pointer, reduces c->used by
// the extent size, and gathers every reference to the extent into dr->refs -
// first the inline references packed after the EXTENT_ITEM header, then any
// non-inline references stored as the following tree items (which are deleted
// as they are collected).
//
// Returns STATUS_SUCCESS, STATUS_INTERNAL_ERROR for a malformed inline
// reference, or the failing status of a helper call.
//
// NOTE(review): none of the error returns visible below release dr; the
// equivalent early failure in add_metadata_reloc calls ExFreePool(mr). Confirm
// whether dr is leaked on these paths.
1313  NTSTATUS Status;
1314  data_reloc* dr;
1315  EXTENT_ITEM* ei;
1316  uint16_t len;
1317  uint64_t inline_rc;
1318  uint8_t* ptr;
1319 
1321  if (!dr) {
1322  ERR("out of memory\n");
1324  }
1325 
1326  dr->address = tp->item->key.obj_id;
1327  dr->size = tp->item->key.offset;
1328  dr->ei = (EXTENT_ITEM*)tp->item->data;
1329  InitializeListHead(&dr->refs);
1330 
1332  if (!NT_SUCCESS(Status)) {
1333  ERR("delete_tree_item returned %08x\n", Status);
1334  return Status;
1335  }
1336 
1337  if (!c)
1339 
1340  if (c) {
1342 
1343  c->used -= tp->item->key.offset;
1344 
1346 
1348  }
1349 
1350  ei = (EXTENT_ITEM*)tp->item->data;
1351  inline_rc = 0;
1352 
// NOTE(review): no check in this function that tp->item->size is at least
// sizeof(EXTENT_ITEM); len is a uint16_t, so a shorter item would make this
// subtraction wrap. Confirm the caller guarantees the minimum size.
1353  len = tp->item->size - sizeof(EXTENT_ITEM);
1354  ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1355 
// Walk the inline references: each is one type byte followed by a payload
// whose size depends on that type.
1356  while (len > 0) {
1357  uint8_t secttype = *ptr;
1358  uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1359 
// Account for the type byte before checking that the payload fits.
1360  len--;
1361 
1362  if (sectlen > len) {
1363  ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1364  return STATUS_INTERNAL_ERROR;
1365  }
1366 
// sectlen is 0 only for a type other than the two handled below.
1367  if (sectlen == 0) {
1368  ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1369  return STATUS_INTERNAL_ERROR;
1370  }
1371 
1372  if (secttype == TYPE_EXTENT_DATA_REF) {
1373  EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1374 
1375  inline_rc += edr->count;
1376 
1377  Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1378  if (!NT_SUCCESS(Status)) {
1379  ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1380  return Status;
1381  }
1382  } else if (secttype == TYPE_SHARED_DATA_REF) {
1383  metadata_reloc* mr;
1385 
1387  if (!ref) {
1388  ERR("out of memory\n");
1390  }
1391 
1392  ref->type = TYPE_SHARED_DATA_REF;
1393  RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1394  inline_rc += ref->sdr.count;
1395 
// sdr.offset identifies the referencing tree block; mr receives its
// metadata_reloc so the reference can be rewritten once that block moves.
1396  Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1397  if (!NT_SUCCESS(Status)) {
1398  ERR("add_metadata_reloc_parent returned %08x\n", Status);
1399  ExFreePool(ref);
1400  return Status;
1401  }
1402 
1403  ref->parent = mr;
1404 
1405  InsertTailList(&dr->refs, &ref->list_entry);
1406  } else {
// Unreachable as written: any other type already returned via the
// sectlen == 0 check above.
1407  ERR("unexpected tree type %x\n", secttype);
1408  return STATUS_INTERNAL_ERROR;
1409  }
1410 
1411 
1412  len -= sectlen;
1413  ptr += sizeof(uint8_t) + sectlen;
1414  }
1415 
// The inline references did not account for the whole refcount, so the rest
// are separate items sharing this extent's obj_id.
1416  if (inline_rc < ei->refcount) { // look for non-inline entries
1417  traverse_ptr tp2 = *tp, next_tp;
1418 
1419  while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1420  tp2 = next_tp;
1421 
1422  if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1423  if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1424  Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1425  if (!NT_SUCCESS(Status)) {
1426  ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1427  return Status;
1428  }
1429 
1430  Status = delete_tree_item(Vcb, &tp2);
1431  if (!NT_SUCCESS(Status)) {
1432  ERR("delete_tree_item returned %08x\n", Status);
1433  return Status;
1434  }
1435  } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1436  metadata_reloc* mr;
1438 
1440  if (!ref) {
1441  ERR("out of memory\n");
1443  }
1444 
// For a non-inline shared ref the parent address is the key offset and the
// item data is just the 32-bit count.
1445  ref->type = TYPE_SHARED_DATA_REF;
1446  ref->sdr.offset = tp2.item->key.offset;
1447  ref->sdr.count = *((uint32_t*)tp2.item->data);
1448 
1449  Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1450  if (!NT_SUCCESS(Status)) {
1451  ERR("add_metadata_reloc_parent returned %08x\n", Status);
1452  ExFreePool(ref);
1453  return Status;
1454  }
1455 
1456  ref->parent = mr;
1457  InsertTailList(&dr->refs, &ref->list_entry);
1458 
1459  Status = delete_tree_item(Vcb, &tp2);
1460  if (!NT_SUCCESS(Status)) {
1461  ERR("delete_tree_item returned %08x\n", Status);
1462  return Status;
1463  }
1464  }
1465  } else
1466  break;
1467  }
1468  }
1469 
1471 
1472  return STATUS_SUCCESS;
1473 }
1474 
// Reorders dr->refs (the signature line is missing from this listing; dr is
// presumably a data_reloc*).
//
// Each entry is moved into a temporary list by insertion sort, having its hash
// computed on the way: the extent-data-ref hash for TYPE_EXTENT_DATA_REF, the
// parent's new address for TYPE_SHARED_DATA_REF. Adjacent extent data refs
// with identical root/objid/offset are then merged by summing their counts,
// and the temporary list is spliced back in as dr->refs.
1476  LIST_ENTRY newlist, *le;
1477 
1478  if (IsListEmpty(&dr->refs))
1479  return;
1480 
1481  // insertion sort
1482 
1483  InitializeListHead(&newlist);
1484 
1485  while (!IsListEmpty(&dr->refs)) {
1487  bool inserted = false;
1488 
// NOTE(review): hash is only assigned for these two types; any other type
// would be compared below with whatever hash already holds.
1489  if (ref->type == TYPE_EXTENT_DATA_REF)
1490  ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1491  else if (ref->type == TYPE_SHARED_DATA_REF)
1492  ref->hash = ref->parent->new_address;
1493 
1494  le = newlist.Flink;
1495  while (le != &newlist) {
1497 
// Stop at the first existing entry that ref should precede: lower type
// first, and within a type the larger hash first.
1498  if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1500  inserted = true;
1501  break;
1502  }
1503 
1504  le = le->Flink;
1505  }
1506 
1507  if (!inserted)
1508  InsertTailList(&newlist, &ref->list_entry);
1509  }
1510 
// Merge pass: duplicates are adjacent after sorting. After a merge the loop
// re-examines the same entry against its new neighbour (hence the continue).
1511  le = newlist.Flink;
1512  while (le != &newlist) {
1514 
1515  if (le->Flink != &newlist) {
1517 
1518  if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1519  ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1520  RemoveEntryList(&ref2->list_entry);
1521  ref->edr.count += ref2->edr.count;
1522  ExFreePool(ref2);
1523  continue;
1524  }
1525  }
1526 
1527  le = le->Flink;
1528  }
1529 
// Re-head the sorted chain on dr->refs; newlist is non-empty here because
// dr->refs was non-empty on entry.
1530  newlist.Flink->Blink = &dr->refs;
1531  newlist.Blink->Flink = &dr->refs;
1532  dr->refs.Flink = newlist.Flink;
1533  dr->refs.Blink = newlist.Blink;
1534 }
1535 
// add_data_reloc_extent_item - the signature line is missing from this listing;
// the name is taken from the error message logged by the caller. Uses Vcb and
// dr.
//
// Writes the extent-tree entries for a relocated data extent at
// dr->new_address: one EXTENT_ITEM carrying as many of dr->refs inline as fit
// within a quarter of the node size, plus one separate EXTENT_DATA_REF or
// SHARED_DATA_REF item for each remaining reference.
//
// Returns STATUS_SUCCESS or the failing status of insert_tree_item.
//
// NOTE(review): the buffers handed to insert_tree_item (ei, edr, sdr) are not
// freed here when the call fails, whereas the BALANCE_ITEM insertion elsewhere
// in this file frees its buffer on failure. Confirm ownership on failure.
1537  NTSTATUS Status;
1538  LIST_ENTRY* le;
1539  uint64_t rc = 0;
1540  uint16_t inline_len;
1541  bool all_inline = true;
1542  data_reloc_ref* first_noninline = NULL;
1543  EXTENT_ITEM* ei;
1544  uint8_t* ptr;
1545 
1546  inline_len = sizeof(EXTENT_ITEM);
1547 
1549 
// First pass: total the reference count and find where the inline area ends.
1550  le = dr->refs.Flink;
1551  while (le != &dr->refs) {
1553  uint16_t extlen = 0;
1554 
1555  if (ref->type == TYPE_EXTENT_DATA_REF) {
1556  extlen += sizeof(EXTENT_DATA_REF);
1557  rc += ref->edr.count;
1558  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1559  extlen += sizeof(SHARED_DATA_REF);
// NOTE(review): counts each shared ref as 1, while add_data_reloc adds
// sdr.count when tallying references - confirm which is intended.
1560  rc++;
1561  }
1562 
// The +1 is the type byte that precedes each inline reference. Once one
// reference fails to fit, every later one is stored non-inline as well.
1563  if (all_inline) {
1564  if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1565  all_inline = false;
1566  first_noninline = ref;
1567  } else
1568  inline_len += extlen + 1;
1569  }
1570 
1571  le = le->Flink;
1572  }
1573 
1574  ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1575  if (!ei) {
1576  ERR("out of memory\n");
1578  }
1579 
1580  ei->refcount = rc;
1581  ei->generation = dr->ei->generation;
1582  ei->flags = dr->ei->flags;
1583  ptr = (uint8_t*)&ei[1];
1584 
// Second pass: serialise the inline references directly after the header.
1585  le = dr->refs.Flink;
1586  while (le != &dr->refs) {
1588 
1589  if (ref == first_noninline)
1590  break;
1591 
1592  *ptr = ref->type;
1593  ptr++;
1594 
1595  if (ref->type == TYPE_EXTENT_DATA_REF) {
1597 
1598  RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1599 
1600  ptr += sizeof(EXTENT_DATA_REF);
1601  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1603 
// Shared refs point at the parent's relocated address, not the original.
1604  sdr->offset = ref->parent->new_address;
1605  sdr->count = ref->sdr.count;
1606 
1607  ptr += sizeof(SHARED_DATA_REF);
1608  }
1609 
1610  le = le->Flink;
1611  }
1612 
1613  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1614  if (!NT_SUCCESS(Status)) {
1615  ERR("insert_tree_item returned %08x\n", Status);
1616  return Status;
1617  }
1618 
// Remaining references become standalone items keyed on the new address.
1619  if (!all_inline) {
1620  le = &first_noninline->list_entry;
1621 
1622  while (le != &dr->refs) {
1624 
1625  if (ref->type == TYPE_EXTENT_DATA_REF) {
1626  EXTENT_DATA_REF* edr;
1627 
1629  if (!edr) {
1630  ERR("out of memory\n");
1632  }
1633 
1634  RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1635 
// Key offset is ref->hash.
1636  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1637  if (!NT_SUCCESS(Status)) {
1638  ERR("insert_tree_item returned %08x\n", Status);
1639  return Status;
1640  }
1641  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1642  uint32_t* sdr;
1643 
1645  if (!sdr) {
1646  ERR("out of memory\n");
1648  }
1649 
1650  *sdr = ref->sdr.count;
1651 
// Key offset is the parent's new address; the payload is only the count.
1652  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1653  if (!NT_SUCCESS(Status)) {
1654  ERR("insert_tree_item returned %08x\n", Status);
1655  return Status;
1656  }
1657  }
1658 
1659  le = le->Flink;
1660  }
1661  }
1662 
1663  return STATUS_SUCCESS;
1664 }
1665 
// Relocates one batch of data extents out of chunk c (the signature line is
// missing from this listing; the body uses Vcb, c and an out-flag *changed).
//
// Outline of the visible code:
//   1. Under an exclusive tree_lock, scan the extent tree from c->offset and
//      collect the chunk's non-tree EXTENT_ITEMs as data_relocs, stopping once
//      0x1000000 bytes or 100 extents have been gathered.
//   2. If nothing was collected, set *changed = false and finish.
//   3. For each extent pick a new address (current target chunk, then any
//      other eligible data chunk, then a freshly allocated chunk), copy the
//      data there at most BALANCE_UNIT bytes at a time, and move its checksums.
//   4. Write the updated metadata and the new extent items, then retarget
//      pending changed_extents.
//   5. On success update the extents cached in the chunks' cache inodes, flush
//      with do_write, and update the extents of all open FCBs.
//   6. Free the tree cache, drop the lock and free all bookkeeping lists.
//
// Returns the final NTSTATUS.
1667  KEY searchkey;
1668  traverse_ptr tp;
1669  NTSTATUS Status;
1670  bool b;
1671  LIST_ENTRY items, metadata_items, rollback, *le;
1672  uint64_t loaded = 0, num_loaded = 0;
1673  chunk* newchunk = NULL;
1674  uint8_t* data = NULL;
1675 
1676  TRACE("chunk %I64x\n", c->offset);
1677 
1680  InitializeListHead(&metadata_items);
1681 
1682  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1683 
1684  searchkey.obj_id = c->offset;
1685  searchkey.obj_type = TYPE_EXTENT_ITEM;
1686  searchkey.offset = 0xffffffffffffffff;
1687 
1688  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1689  if (!NT_SUCCESS(Status)) {
1690  ERR("find_item returned %08x\n", Status);
1691  goto end;
1692  }
1693 
// Step 1: collect the data extents that lie inside this chunk.
1694  do {
1695  traverse_ptr next_tp;
1696 
1697  if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1698  break;
1699 
1700  if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1701  bool tree = false;
1702 
// The test that sets tree (on ei) is on a line missing from this listing.
1703  if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1704  EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1705 
1707  tree = true;
1708  }
1709 
1710  if (!tree) {
1711  Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1712 
1713  if (!NT_SUCCESS(Status)) {
1714  ERR("add_data_reloc returned %08x\n", Status);
1715  goto end;
1716  }
1717 
1718  loaded += tp.item->key.offset;
1719  num_loaded++;
1720 
1721  if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1722  break;
1723  }
1724  }
1725 
1726  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1727 
1728  if (b)
1729  tp = next_tp;
1730  } while (b);
1731 
// Step 2: nothing to move in this pass.
1732  if (IsListEmpty(&items)) {
1733  *changed = false;
1735  goto end;
1736  } else
1737  *changed = true;
1738 
1740  if (!data) {
1741  ERR("out of memory\n");
1743  goto end;
1744  }
1745 
// Step 3: place and copy each extent.
1746  le = items.Flink;
1747  while (le != &items) {
1749  bool done = false;
1750  LIST_ENTRY* le2;
1751  uint32_t* csum;
1752  RTL_BITMAP bmp;
1753  ULONG* bmparr;
1754  ULONG bmplen, runlength, index, lastoff;
1755 
// Prefer the chunk used for the previous extent, if it still has room.
1756  if (newchunk) {
1757  acquire_chunk_lock(newchunk, Vcb);
1758 
1759  if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1760  newchunk->used += dr->size;
1761  space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1762  done = true;
1763  }
1764 
1765  release_chunk_lock(newchunk, Vcb);
1766  }
1767 
1768  if (!done) {
1769  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1770 
// Otherwise try every writable data chunk that is not itself being
// relocated and has the filesystem's current data flags.
1771  le2 = Vcb->chunks.Flink;
1772  while (le2 != &Vcb->chunks) {
1773  chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1774 
1775  if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1776  acquire_chunk_lock(c2, Vcb);
1777 
1778  if ((c2->chunk_item->size - c2->used) >= dr->size) {
1779  if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1780  c2->used += dr->size;
1781  space_list_subtract(c2, false, dr->new_address, dr->size, &rollback);
1782  release_chunk_lock(c2, Vcb);
1783  newchunk = c2;
1784  done = true;
1785  break;
1786  }
1787  }
1788 
1789  release_chunk_lock(c2, Vcb);
1790  }
1791 
1792  le2 = le2->Flink;
1793  }
1794 
1795  // allocate new chunk if necessary
1796  if (!done) {
1797  Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1798 
1799  if (!NT_SUCCESS(Status)) {
1800  ERR("alloc_chunk returned %08x\n", Status);
1801  ExReleaseResourceLite(&Vcb->chunk_lock);
1802  goto end;
1803  }
1804 
1805  acquire_chunk_lock(newchunk, Vcb);
1806 
1807  newchunk->balance_num = Vcb->balance.balance_num;
1808 
1809  if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1810  release_chunk_lock(newchunk, Vcb);
1811  ExReleaseResourceLite(&Vcb->chunk_lock);
1812  ERR("could not find address in new chunk\n");
1814  goto end;
1815  } else {
1816  newchunk->used += dr->size;
1817  space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1818  }
1819 
1820  release_chunk_lock(newchunk, Vcb);
1821  }
1822 
1823  ExReleaseResourceLite(&Vcb->chunk_lock);
1824  }
1825 
1826  dr->newchunk = newchunk;
1827 
// One bitmap bit and one 32-bit checksum slot per sector of the extent.
1828  bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size);
1829 
// NOTE(review): the buffer is sized as (bmplen + 1) bytes rounded up to a
// ULONG, although bmplen is a number of bits; this looks like a harmless
// over-allocation - confirm.
1830  bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1831  if (!bmparr) {
1832  ERR("out of memory\n");
1834  goto end;
1835  }
1836 
1837  csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(uint32_t) / Vcb->superblock.sector_size), ALLOC_TAG);
1838  if (!csum) {
1839  ERR("out of memory\n");
1840  ExFreePool(bmparr);
1842  goto end;
1843  }
1844 
1845  RtlInitializeBitMap(&bmp, bmparr, bmplen);
1846  RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1847 
1848  searchkey.obj_id = EXTENT_CSUM_ID;
1849  searchkey.obj_type = TYPE_EXTENT_CSUM;
1850  searchkey.offset = dr->address;
1851 
1852  Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1853  if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1854  ERR("find_item returned %08x\n", Status);
1855  ExFreePool(csum);
1856  ExFreePool(bmparr);
1857  goto end;
1858  }
1859 
// Load the existing checksums overlapping [address, address + size) into
// csum and clear the matching bitmap bits.
1860  if (Status != STATUS_NOT_FOUND) {
1861  do {
1862  traverse_ptr next_tp;
1863 
1864  if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1865  if (tp.item->key.offset >= dr->address + dr->size)
1866  break;
1867  else if (tp.item->size >= sizeof(uint32_t) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= dr->address) {
// cs/ce: the byte range shared by this checksum item and the extent.
1868  uint64_t cs = max(dr->address, tp.item->key.offset);
1869  uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)));
1870 
1871  RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1872  tp.item->data + ((cs - tp.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size),
1873  (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size));
1874 
1875  RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1876 
1877  if (ce == dr->address + dr->size)
1878  break;
1879  }
1880  }
1881 
1882  if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1883  tp = next_tp;
1884  else
1885  break;
1886  } while (true);
1887  }
1888 
// Copy the extent run by run. lastoff is the first sector not yet copied;
// index/runlength describe the next run of sectors that have checksums.
1889  lastoff = 0;
1890  runlength = RtlFindFirstRunClear(&bmp, &index);
1891 
1892  while (runlength != 0) {
1893  if (index >= bmplen)
1894  break;
1895 
// Clamp a run that would reach past the end of the bitmap.
1896  if (index + runlength >= bmplen) {
1897  runlength = bmplen - index;
1898 
1899  if (runlength == 0)
1900  break;
1901  }
1902 
1903  if (index > lastoff) {
1904  ULONG off = lastoff;
1905  ULONG size = index - lastoff;
1906 
1907  // handle no csum run
1908  do {
1909  ULONG rl;
1910 
1911  if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1912  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1913  else
1914  rl = size;
1915 
1916  Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1917  c, NULL, NULL, 0, false, NormalPagePriority);
1918  if (!NT_SUCCESS(Status)) {
1919  ERR("read_data returned %08x\n", Status);
1920  ExFreePool(csum);
1921  ExFreePool(bmparr);
1922  goto end;
1923  }
1924 
1925  Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1926  NULL, newchunk, false, 0, NormalPagePriority);
1927  if (!NT_SUCCESS(Status)) {
1928  ERR("write_data_complete returned %08x\n", Status);
1929  ExFreePool(csum);
1930  ExFreePool(bmparr);
1931  goto end;
1932  }
1933 
1934  size -= rl;
1935  off += rl;
1936  } while (size > 0);
1937  }
1938 
// Register the checksums at the new location; the second call passes NULL
// as the checksum data for the old location.
1939  add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1940  add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1941 
1942  // handle csum run
1943  do {
1944  ULONG rl;
1945 
1946  if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1947  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1948  else
1949  rl = runlength;
1950 
1951  Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], false, data,
1952  c, NULL, NULL, 0, false, NormalPagePriority);
1953  if (!NT_SUCCESS(Status)) {
1954  ERR("read_data returned %08x\n", Status);
1955  ExFreePool(csum);
1956  ExFreePool(bmparr);
1957  goto end;
1958  }
1959 
1960  Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1961  NULL, newchunk, false, 0, NormalPagePriority);
1962  if (!NT_SUCCESS(Status)) {
1963  ERR("write_data_complete returned %08x\n", Status);
1964  ExFreePool(csum);
1965  ExFreePool(bmparr);
1966  goto end;
1967  }
1968 
1969  runlength -= rl;
1970  index += rl;
1971  } while (runlength > 0);
1972 
1973  lastoff = index;
1974  runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1975  }
1976 
1977  ExFreePool(csum);
1978  ExFreePool(bmparr);
1979 
1980  // handle final nocsum run
1981  if (lastoff < dr->size / Vcb->superblock.sector_size) {
1982  ULONG off = lastoff;
1983  ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1984 
1985  do {
1986  ULONG rl;
1987 
1988  if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1989  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1990  else
1991  rl = size;
1992 
1993  Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1994  c, NULL, NULL, 0, false, NormalPagePriority);
1995  if (!NT_SUCCESS(Status)) {
1996  ERR("read_data returned %08x\n", Status);
1997  goto end;
1998  }
1999 
2000  Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
2001  NULL, newchunk, false, 0, NormalPagePriority);
2002  if (!NT_SUCCESS(Status)) {
2003  ERR("write_data_complete returned %08x\n", Status);
2004  goto end;
2005  }
2006 
2007  size -= rl;
2008  off += rl;
2009  } while (size > 0);
2010  }
2011 
2012  le = le->Flink;
2013  }
2014 
// Copy buffer no longer needed; NULL it so the cleanup at the end of the
// function does not free it again.
2015  ExFreePool(data);
2016  data = NULL;
2017 
// Step 4: persist metadata changes and the extent items for the new addresses.
2018  Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2019  if (!NT_SUCCESS(Status)) {
2020  ERR("write_metadata_items returned %08x\n", Status);
2021  goto end;
2022  }
2023 
2024  le = items.Flink;
2025  while (le != &items) {
2027 
2029  if (!NT_SUCCESS(Status)) {
2030  ERR("add_data_reloc_extent_item returned %08x\n", Status);
2031  goto end;
2032  }
2033 
2034  le = le->Flink;
2035  }
2036 
// Retarget changed_extents entries that refer to a moved extent. le3 caches
// the next entry before the body runs (lines inside the match branch are
// missing from this listing).
2037  le = c->changed_extents.Flink;
2038  while (le != &c->changed_extents) {
2039  LIST_ENTRY *le2, *le3;
2041 
2042  le3 = le->Flink;
2043 
2044  le2 = items.Flink;
2045  while (le2 != &items) {
2047 
2048  if (ce->address == dr->address) {
2049  ce->address = dr->new_address;
2052  break;
2053  }
2054 
2055  le2 = le2->Flink;
2056  }
2057 
2058  le = le3;
2059  }
2060 
2062 
2063  Vcb->need_write = true;
2064 
// Step 5 onwards: every path, success or failure, arrives here.
2065 end:
2066  if (NT_SUCCESS(Status)) {
2067  // update extents in cache inodes before we flush
2068  le = Vcb->chunks.Flink;
2069  while (le != &Vcb->chunks) {
2071 
2072  if (c2->cache) {
2073  LIST_ENTRY* le2;
2074 
2075  ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2076 
2077  le2 = c2->cache->extents.Flink;
2078  while (le2 != &c2->cache->extents) {
2080 
2081  if (!ext->ignore) {
2082  if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2083  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2084 
// Only extents whose address falls inside chunk c can have moved.
2085  if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2086  LIST_ENTRY* le3 = items.Flink;
2087  while (le3 != &items) {
2089 
2090  if (ed2->address == dr->address) {
2091  ed2->address = dr->new_address;
2092  break;
2093  }
2094 
2095  le3 = le3->Flink;
2096  }
2097  }
2098  }
2099  }
2100 
2101  le2 = le2->Flink;
2102  }
2103 
2104  ExReleaseResourceLite(c2->cache->Header.Resource);
2105  }
2106 
2107  le = le->Flink;
2108  }
2109 
2110  Status = do_write(Vcb, NULL);
2111  if (!NT_SUCCESS(Status))
2112  ERR("do_write returned %08x\n", Status);
2113  }
2114 
// Re-tested because do_write above may have changed Status.
2115  if (NT_SUCCESS(Status)) {
2117 
2118  // update open FCBs
2119  // FIXME - speed this up(?)
2120 
2121  le = Vcb->all_fcbs.Flink;
2122  while (le != &Vcb->all_fcbs) {
2123  struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2124  LIST_ENTRY* le2;
2125 
2126  ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2127 
2128  le2 = fcb->extents.Flink;
2129  while (le2 != &fcb->extents) {
2131 
2132  if (!ext->ignore) {
2133  if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2134  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2135 
2136  if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2137  LIST_ENTRY* le3 = items.Flink;
2138  while (le3 != &items) {
2140 
2141  if (ed2->address == dr->address) {
2142  ed2->address = dr->new_address;
2143  break;
2144  }
2145 
2146  le3 = le3->Flink;
2147  }
2148  }
2149  }
2150  }
2151 
2152  le2 = le2->Flink;
2153  }
2154 
2155  ExReleaseResourceLite(fcb->Header.Resource);
2156 
2157  le = le->Flink;
2158  }
2159  } else
2161 
2162  free_trees(Vcb);
2163 
2164  ExReleaseResourceLite(&Vcb->tree_lock);
2165 
2166  if (data)
2167  ExFreePool(data);
2168 
// Step 6: drain both bookkeeping lists, freeing each entry and its refs.
2169  while (!IsListEmpty(&items)) {
2171 
2172  while (!IsListEmpty(&dr->refs)) {
2174 
2175  ExFreePool(ref);
2176  }
2177 
2178  ExFreePool(dr);
2179  }
2180 
2181  while (!IsListEmpty(&metadata_items)) {
2183 
2184  while (!IsListEmpty(&mr->refs)) {
2186 
2187  ExFreePool(ref);
2188  }
2189 
2190  ExFreePool(mr);
2191  }
2192 
2193  return Status;
2194 }
2195 
2196 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2197  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2198  return BLOCK_FLAG_RAID0;
2199  else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2200  return BLOCK_FLAG_RAID1;
2201  else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2202  return BLOCK_FLAG_DUPLICATE;
2203  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2204  return BLOCK_FLAG_RAID10;
2205  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2206  return BLOCK_FLAG_RAID5;
2207  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2208  return BLOCK_FLAG_RAID6;
2209  else
2210  return BLOCK_FLAG_SINGLE;
2211 }
2212 
// Decides whether chunk c passes the balance filters stored in
// Vcb->balance.opts[sort] (the signature line is missing from this listing; the
// body uses Vcb, sort and c).
//
// Returns false if that option set is not enabled or if any enabled filter
// (profiles, devid, drange, vrange, stripes, usage, and the final
// convert-related check) rejects the chunk; returns true otherwise.
2214  btrfs_balance_opts* opts;
2215 
2216  opts = &Vcb->balance.opts[sort];
2217 
2218  if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2219  return false;
2220 
// Profile filter (the line that computes `type` is missing from this listing).
2221  if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2223 
2224  if (!(type & opts->profiles))
2225  return false;
2226  }
2227 
// Device filter: at least one stripe must live on opts->devid. The stripe
// array is read from directly after the CHUNK_ITEM header.
2228  if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2229  uint16_t i;
2230  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2231  bool b = false;
2232 
2233  for (i = 0; i < c->chunk_item->num_stripes; i++) {
2234  if (cis[i].dev_id == opts->devid) {
2235  b = true;
2236  break;
2237  }
2238  }
2239 
2240  if (!b)
2241  return false;
2242  }
2243 
// Device-range filter: some stripe's physical span must overlap
// [drange_start, drange_end) - and be on opts->devid when that filter is on.
2244  if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2245  uint16_t i, factor;
2246  uint64_t physsize;
2247  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2248  bool b = false;
2249 
// factor is the divisor used below to turn the chunk's logical size into
// the per-stripe physical size.
2250  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2251  factor = c->chunk_item->num_stripes;
2252  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2253  factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2254  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2255  factor = c->chunk_item->num_stripes - 1;
2256  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2257  factor = c->chunk_item->num_stripes - 2;
2258  else // SINGLE, DUPLICATE, RAID1
2259  factor = 1;
2260 
// NOTE(review): assumes factor != 0 (and sub_stripes != 0 above); nothing
// here guards the divisions - confirm chunk items are validated on load.
2261  physsize = c->chunk_item->size / factor;
2262 
2263  for (i = 0; i < c->chunk_item->num_stripes; i++) {
2264  if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2265  (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2266  b = true;
2267  break;
2268  }
2269  }
2270 
2271  if (!b)
2272  return false;
2273  }
2274 
// Virtual-range filter: reject chunks wholly outside the logical range.
2275  if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2276  if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2277  return false;
2278  }
2279 
// NOTE(review): the second comparison rejects chunks with FEWER stripes than
// stripes_end, so only num_stripes >= max(stripes_start, stripes_end) passes.
// By analogy with the usage check below (value < start || value > end) the
// upper bound was presumably meant to be `> opts->stripes_end` - confirm.
2280  if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2281  if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2282  return false;
2283  }
2284 
// Usage filter: percentage of the chunk in use, rounded down.
2285  if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2286  uint64_t usage = c->used * 100 / c->chunk_item->size;
2287 
2288  // usage == 0 should mean completely empty, not just that usage rounds to 0%
2289  if (c->used > 0 && usage == 0)
2290  usage = 1;
2291 
2292  if (usage < opts->usage_start || usage > opts->usage_end)
2293  return false;
2294  }
2295 
// The condition opening this block and the computation of `type` are on lines
// missing from this listing; the visible part rejects a chunk whose `type`
// already equals opts->convert.
2298 
2299  if (type == opts->convert)
2300  return false;
2301  }
2302 
2303  return true;
2304 }
2305 
// copy_balance_args - the signature line is missing from this listing; the
// caller in this file invokes it as copy_balance_args(&Vcb->balance.opts[...],
// &bi-><field>), i.e. (opts, args).
//
// Translates the in-memory balance options `opts` into the `args` structure
// that is stored inside a BALANCE_ITEM: for every filter enabled in
// opts->flags the matching fields are copied into args. Several of the lines
// that set the corresponding bit in args->flags are missing from this listing.
2307  if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2308  args->profiles = opts->profiles;
2310  }
2311 
// Note that the branch tests the destination field args->usage_start, not
// opts. The caller visible in this file zeroes the whole BALANCE_ITEM before
// calling, so the first branch is the one taken there.
2312  if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2313  if (args->usage_start == 0) {
2315  args->usage_start = opts->usage_start;
2316  args->usage_end = opts->usage_end;
2317  } else {
2318  args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2319  args->usage = opts->usage_end;
2320  }
2321  }
2322 
2323  if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2324  args->devid = opts->devid;
2325  args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2326  }
2327 
2328  if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2329  args->drange_start = opts->drange_start;
2330  args->drange_end = opts->drange_end;
2331  args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2332  }
2333 
2334  if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2335  args->vrange_start = opts->vrange_start;
2336  args->vrange_end = opts->vrange_end;
2337  args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2338  }
2339 
// The soft flag is only propagated together with a conversion target.
2340  if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2341  args->convert = opts->convert;
2342  args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2343 
2344  if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2345  args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2346  }
2347 
// Same destination-field test as for usage above; the range form narrows the
// limits to 32 bits.
2348  if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2349  if (args->limit_start == 0) {
2351  args->limit_start = (uint32_t)opts->limit_start;
2352  args->limit_end = (uint32_t)opts->limit_end;
2353  } else {
2354  args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2355  args->limit = opts->limit_end;
2356  }
2357  }
2358 
2359  if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2360  args->stripes_start = opts->stripes_start;
2361  args->stripes_end = opts->stripes_end;
2363  }
2364 }
2365 
// Stores the current balance options on disk (the signature line is missing
// from this listing; the body uses Vcb).
//
// Under an exclusive tree_lock: deletes any existing item with key
// (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0) from the root tree, builds a zeroed
// BALANCE_ITEM, fills in the data / metadata / system sections for each option
// set that is enabled, inserts it under the same key, and flushes with
// do_write if everything succeeded.
//
// Returns the final NTSTATUS.
2367  KEY searchkey;
2368  traverse_ptr tp;
2369  NTSTATUS Status;
2370  BALANCE_ITEM* bi;
2371 
2372  searchkey.obj_id = BALANCE_ITEM_ID;
2373  searchkey.obj_type = TYPE_TEMP_ITEM;
2374  searchkey.offset = 0;
2375 
2376  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2377 
2378  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2379  if (!NT_SUCCESS(Status)) {
2380  ERR("find_item returned %08x\n", Status);
2381  goto end;
2382  }
2383 
// keycmp() == 0 means an item with exactly this key already exists; it is
// replaced rather than updated in place.
2384  if (!keycmp(tp.item->key, searchkey)) {
2386  if (!NT_SUCCESS(Status)) {
2387  ERR("delete_tree_item returned %08x\n", Status);
2388  goto end;
2389  }
2390  }
2391 
2393  if (!bi) {
2394  ERR("out of memory\n");
2396  goto end;
2397  }
2398 
// Zeroing first leaves the sections of disabled option sets empty.
2399  RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2400 
2401  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2402  bi->flags |= BALANCE_FLAGS_DATA;
2403  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2404  }
2405 
2406  if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2408  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2409  }
2410 
2411  if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2412  bi->flags |= BALANCE_FLAGS_SYSTEM;
2413  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2414  }
2415 
// bi is freed here only when the insertion fails.
2416  Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2417  if (!NT_SUCCESS(Status)) {
2418  ERR("insert_tree_item returned %08x\n", Status);
2419  ExFreePool(bi);
2420  goto end;
2421  }
2422 
2424 
// Common exit: flush only on success, but always drop cached trees and the lock.
2425 end:
2426  if (NT_SUCCESS(Status)) {
2427  Status = do_write(Vcb, NULL);
2428  if (!NT_SUCCESS(Status))
2429  ERR("do_write returned %08x\n", Status);
2430  }
2431 
2432  free_trees(Vcb);
2433 
2434  ExReleaseResourceLite(&Vcb->tree_lock);
2435 
2436  return Status;
2437 }
2438 
// Deletes the on-disk balance item (the signature line is missing from this
// listing; the body uses Vcb).
//
// Under an exclusive tree_lock: looks up key (BALANCE_ITEM_ID, TYPE_TEMP_ITEM,
// 0) in the root tree and, if an item with exactly that key exists, deletes it
// and flushes with do_write. Returns the resulting NTSTATUS.
2440  KEY searchkey;
2441  traverse_ptr tp;
2442  NTSTATUS Status;
2443 
2444  searchkey.obj_id = BALANCE_ITEM_ID;
2445  searchkey.obj_type = TYPE_TEMP_ITEM;
2446  searchkey.offset = 0;
2447 
2448  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2449 
2450  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2451  if (!NT_SUCCESS(Status)) {
2452  ERR("find_item returned %08x\n", Status);
2453  goto end;
2454  }
2455 
2456  if (!keycmp(tp.item->key, searchkey)) {
2458  if (!NT_SUCCESS(Status)) {
2459  ERR("delete_tree_item returned %08x\n", Status);
2460  goto end;
2461  }
2462 
// NOTE(review): when do_write fails, the goto skips the free_trees call
// below, whereas the function that writes the balance item calls free_trees
// on every exit - confirm whether the trees should be freed here too.
2463  Status = do_write(Vcb, NULL);
2464  if (!NT_SUCCESS(Status)) {
2465  ERR("do_write returned %08x\n", Status);
2466  goto end;
2467  }
2468 
2469  free_trees(Vcb);
2470  }
2471 
2473 
2474 end:
2475  ExReleaseResourceLite(&Vcb->tree_lock);
2476 
2477  return Status;
2478 }
2479 
// Converts a stored `args` structure back into in-memory balance options
// `opts` - the reverse direction of copy_balance_args (the signature line is
// missing from this listing).
//
// For each filter flagged in args->flags the matching opts fields are filled
// in. Most of the lines that set the corresponding bit in opts->flags, and the
// first statement of the body, are missing from this listing.
2482 
2483  if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2485  opts->profiles = args->profiles;
2486  }
2487 
// A single usage value becomes the range [0, usage]; an explicit range is
// copied as is. Both forms are narrowed to 8 bits.
2488  if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2490 
2491  opts->usage_start = 0;
2492  opts->usage_end = (uint8_t)args->usage;
2493  } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2495 
2496  opts->usage_start = (uint8_t)args->usage_start;
2497  opts->usage_end = (uint8_t)args->usage_end;
2498  }
2499 
2500  if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2502  opts->devid = args->devid;
2503  }
2504 
2505  if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2507  opts->drange_start = args->drange_start;
2508  opts->drange_end = args->drange_end;
2509  }
2510 
2511  if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2513  opts->vrange_start = args->vrange_start;
2514  opts->vrange_end = args->vrange_end;
2515  }
2516 
// Same single-value / range handling as for usage.
2517  if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2519 
2520  opts->limit_start = 0;
2521  opts->limit_end = args->limit;
2522  } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2524 
2525  opts->limit_start = args->limit_start;
2526  opts->limit_end = args->limit_end;
2527  }
2528 
// Stripe bounds are narrowed to 16 bits.
2529  if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2531 
2532  opts->stripes_start = (uint16_t)args->stripes_start;
2533  opts->stripes_end = (uint16_t)args->stripes_end;
2534  }
2535 
// The soft flag is only honoured together with a conversion target.
2536  if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2538  opts->convert = args->convert;
2539 
2540  if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2541  opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2542  }
2543 }
2544 
// remove_superblocks - the signature line is missing from this listing; the
// name is taken from the warning logged by the caller. Uses dev.
//
// Overwrites the device's superblock copies with zeroes: a zero-filled buffer
// of sizeof(superblock) bytes is written at each address in superblock_addrs,
// stopping at the first entry that is 0 or that would not fit within
// dev->devitem.num_bytes.
//
// Returns STATUS_SUCCESS, or the status of the first failed write (returned
// without logging).
2546  NTSTATUS Status;
2547  superblock* sb;
2548  int i = 0;
2549 
2551  if (!sb) {
2552  ERR("out of memory\n");
2554  }
2555 
2556  RtlZeroMemory(sb, sizeof(superblock));
2557 
2558  while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2559  Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2560 
2561  if (!NT_SUCCESS(Status)) {
2562  ExFreePool(sb);
2563  return Status;
2564  }
2565 
2566  i++;
2567  }
2568 
2569  ExFreePool(sb);
2570 
2571  return STATUS_SUCCESS;
2572 }
2573 
/* Body of finish_removing_device (named in the caller's error message).
 * Final step of removing `dev` from the volume: flushes pending writes,
 * looks up the device's item in the chunk tree and its stats item in the
 * device tree, updates the superblock counters, unlinks and frees the
 * device, and fixes up the volume's child list.
 *
 * NOTE(review): this listing is missing the signature line and a number of
 * statements (wherever the embedded line numbers skip). Comments below only
 * describe what is visible; confirm against the real source before editing. */
2575  KEY searchkey;
2576  traverse_ptr tp;
2577  NTSTATUS Status;
2578  LIST_ENTRY* le;
2580 
// Flush anything outstanding first so the tree edits below start from a clean state.
2581  if (Vcb->need_write) {
2582  Status = do_write(Vcb, NULL);
2583 
2584  if (!NT_SUCCESS(Status))
2585  ERR("do_write returned %08x\n", Status);
2586  } else
// NOTE(review): the statement belonging to this else is missing from the listing
// (presumably it sets Status to success) - TODO confirm.
2588 
// Trees are freed whether or not the write succeeded; the failure is returned afterwards.
2589  free_trees(Vcb);
2590 
2591  if (!NT_SUCCESS(Status))
2592  return Status;
2593 
2594  // remove entry in chunk tree
2595 
2596  searchkey.obj_id = 1;
2597  searchkey.obj_type = TYPE_DEV_ITEM;
2598  searchkey.offset = dev->devitem.dev_id;
2599 
2600  Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2601  if (!NT_SUCCESS(Status)) {
2602  ERR("find_item returned %08x\n", Status);
2603  return Status;
2604  }
2605 
// Only act on an exact key match. The delete call itself is not visible here,
// but the error message below names delete_tree_item.
2606  if (!keycmp(searchkey, tp.item->key)) {
2608 
2609  if (!NT_SUCCESS(Status)) {
2610  ERR("delete_tree_item returned %08x\n", Status);
2611  return Status;
2612  }
2613  }
2614 
2615  // remove stats entry in device tree
2616 
2617  searchkey.obj_id = 0;
2618  searchkey.obj_type = TYPE_DEV_STATS;
2619  searchkey.offset = dev->devitem.dev_id;
2620 
2621  Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2622  if (!NT_SUCCESS(Status)) {
2623  ERR("find_item returned %08x\n", Status);
2624  return Status;
2625  }
2626 
2627  if (!keycmp(searchkey, tp.item->key)) {
2629 
2630  if (!NT_SUCCESS(Status)) {
2631  ERR("delete_tree_item returned %08x\n", Status);
2632  return Status;
2633  }
2634  }
2635 
2636  // update superblock
2637 
2638  Vcb->superblock.num_devices--;
2639  Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2640  Vcb->devices_loaded--;
2641 
// From here on dev is no longer reachable through Vcb->devices.
2642  RemoveEntryList(&dev->list_entry);
2643 
2644  // flush
2645 
2646  Status = do_write(Vcb, NULL);
2647  if (!NT_SUCCESS(Status))
2648  ERR("do_write returned %08x\n", Status);
2649 
2650  free_trees(Vcb);
2651 
// NOTE(review): on this failure path dev has already been unlinked but is not freed.
2652  if (!NT_SUCCESS(Status))
2653  return Status;
2654 
// Superblock removal is only attempted for a writable device that has a device object;
// a failure is logged as a warning and does not abort the removal.
2655  if (!dev->readonly && dev->devobj) {
2657  if (!NT_SUCCESS(Status))
2658  WARN("remove_superblocks returned %08x\n", Status);
2659  }
2660 
2661  // remove entry in volume list
2662 
2663  vde = Vcb->vde;
2664 
2665  if (dev->devobj) {
2666  pdo_device_extension* pdode = vde->pdode;
2667 
2669 
// Find the child whose UUID matches the device being removed.
2670  le = pdode->children.Flink;
2671  while (le != &pdode->children) {
2673 
2674  if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2676  PDEVICE_OBJECT mountmgr;
2677  UNICODE_STRING mmdevpath;
2678 
2679  pdode->children_loaded--;
2680 
2681  if (vc->had_drive_letter) { // re-add entry to mountmgr
2683  Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2684  if (!NT_SUCCESS(Status))
2685  ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2686  else {
2687  MOUNTDEV_NAME mdn;
2688 
// First query uses a fixed-size MOUNTDEV_NAME to learn NameLength; the second query
// below uses a buffer sized from that length to fetch the whole name.
2689  Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2691  ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2692  else {
2693  MOUNTDEV_NAME* mdn2;
2694  ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2695 
2696  mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2697  if (!mdn2)
2698  ERR("out of memory\n");
2699  else {
2700  Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2701  if (!NT_SUCCESS(Status))
2702  ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2703  else {
2705 
// name points into mdn2, so it must be used before mdn2 is freed below.
2706  name.Buffer = mdn2->Name;
2707  name.Length = name.MaximumLength = mdn2->NameLength;
2708 
2709  Status = mountmgr_add_drive_letter(mountmgr, &name);
2710  if (!NT_SUCCESS(Status))
2711  WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2712  }
2713 
2714  ExFreePool(mdn2);
2715  }
2716  }
2717 
2719  }
2720  }
2721 
2722  ExFreePool(vc->pnp_name.Buffer);
2724  ExFreePool(vc);
2725 
2727 
// At most one child is handled; stop after the first match.
2728  break;
2729  }
2730 
2731  le = le->Flink;
2732  }
2733 
// Recompute FILE_REMOVABLE_MEDIA on the volume: clear it, then set it again
// if any remaining child device still has the characteristic.
2734  if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2735  vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2736 
2737  le = pdode->children.Flink;
2738  while (le != &pdode->children) {
2740 
2741  if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2742  vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2743  break;
2744  }
2745 
2746  le = le->Flink;
2747  }
2748  }
2749 
2750  pdode->num_children = Vcb->superblock.num_devices;
2751 
2753 
2754  // free dev
2755 
// NOTE(review): the statement guarded by this condition is missing from the listing.
2756  if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2758  }
2759 
// Release every free-space entry still attached to the device, then the device itself.
2760  while (!IsListEmpty(&dev->space)) {
2761  LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2763 
2764  ExFreePool(s);
2765  }
2766 
2767  ExFreePool(dev);
2768 
// Vcb->trim stays set only if at least one remaining device has trim set.
2769  if (Vcb->trim) {
2770  Vcb->trim = false;
2771 
2772  le = Vcb->devices.Flink;
2773  while (le != &Vcb->devices) {
2774  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2775 
2776  if (dev2->trim) {
2777  Vcb->trim = true;
2778  break;
2779  }
2780 
2781  le = le->Flink;
2782  }
2783  }
2784 
2786 
2787  return STATUS_SUCCESS;
2788 }
2789 
2792  DEVICE_DATA_SET_RANGE* ranges;
2793  ULONG datalen, i;
2794  KEY searchkey;
2795  traverse_ptr tp;
2796  NTSTATUS Status;
2797  bool b;
2798  uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2799  LIST_ENTRY* le;
2800 
2801  dev->num_trim_entries = 0;
2802 
2803  searchkey.obj_id = dev->devitem.dev_id;
2804  searchkey.obj_type = TYPE_DEV_EXTENT;
2805  searchkey.offset = 0;
2806 
2807  Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2808  if (!NT_SUCCESS(Status)) {
2809  ERR("find_item returned %08x\n", Status);
2810  return;
2811  }
2812 
2813  do {
2814  traverse_ptr next_tp;
2815 
2816  if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2817  if (tp.item->size >= sizeof(DEV_EXTENT)) {
2818  DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2819 
2820  if (tp.item->key.offset > lastoff)
2821  add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2822 
2823  lastoff = tp.item->key.offset + de->length;
2824  } else {
2825  ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2826  return;
2827  }
2828  }
2829 
2830  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2831 
2832  if (b) {
2833  tp = next_tp;
2834  if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2835  break;
2836  }
2837  } while (b);
2838 
2839  if (lastoff < dev->devitem.num_bytes)
2840  add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2841 
2842  if (dev->num_trim_entries == 0)
2843  return;
2844 
2845  datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2846 
2848  if (!dmdsa) {
2849  ERR("out of memory\n");
2850  goto end;
2851  }
2852 
2853  dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2854  dmdsa->Action = DeviceDsmAction_Trim;
2856  dmdsa->ParameterBlockOffset = 0;
2857  dmdsa->ParameterBlockLength = 0;
2859  dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2860 
2861  ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2862 
2863  i = 0;
2864  le = dev->trim_list.Flink;
2865  while (le != &dev->trim_list) {
2867 
2868  ranges[i].StartingOffset = s->address;
2869  ranges[i].LengthInBytes = s->size;
2870  i++;
2871 
2872  le = le->Flink;
2873  }
2874 
2876  if (!NT_SUCCESS(Status))
2877  WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2878 
2879  ExFreePool(dmdsa);
2880 
2881 end:
2882  while (!IsListEmpty(&dev->trim_list)) {
2884  ExFreePool(s);
2885  }
2886 
2887  dev->num_trim_entries = 0;
2888 }
2889 
/* Body of try_consolidation (named in the caller's error message).
 * Repeatedly picks the least-used, writable data chunk not yet handled in
 * this balance run and relocates its contents with balance_data_chunk.
 * Once no candidate is left it calls alloc_chunk with `flags` and, on
 * success, stores the new chunk in *newchunk.
 *
 * NOTE(review): the signature line and a few statements are missing from
 * this listing (wherever the embedded line numbers skip).
 * NOTE(review): when a stop is requested the function returns
 * STATUS_SUCCESS without ever writing *newchunk, and rc->reloc is left
 * set; callers must not dereference the out pointer in that case. */
2891  NTSTATUS Status;
2892  bool changed;
2893  LIST_ENTRY* le;
2894  chunk* rc;
2895 
2896  // FIXME - allow with metadata chunks?
2897 
2898  while (true) {
2899  rc = NULL;
2900 
// tree_lock is held (shared) across the selection and the marking of rc below;
// chunk_lock only covers the list walk.
2901  ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2902 
2903  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2904 
2905  // choose the least-used chunk we haven't looked at yet
2906  le = Vcb->chunks.Flink;
2907  while (le != &Vcb->chunks) {
2909 
2910  // FIXME - skip full-size chunks over e.g. 90% full?
2911  if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2912  rc = c;
2913 
2914  le = le->Flink;
2915  }
2916 
2917  ExReleaseResourceLite(&Vcb->chunk_lock);
2918 
// No candidate left: leave the loop and try the allocation.
2919  if (!rc) {
2920  ExReleaseResourceLite(&Vcb->tree_lock);
2921  break;
2922  }
2923 
// A non-NULL Flink means the chunk is on a balance list; it is accounted as done here.
// NOTE(review): the statement that unlinks it is not visible in this listing.
2924  if (rc->list_entry_balance.Flink) {
2926  Vcb->balance.chunks_left--;
2927  }
2928 
2929  rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2930  rc->reloc = true;
2931 
2932  ExReleaseResourceLite(&Vcb->tree_lock);
2933 
// Keep relocating until balance_data_chunk reports that nothing changed.
2934  do {
2935  changed = false;
2936 
2937  Status = balance_data_chunk(Vcb, rc, &changed);
2938  if (!NT_SUCCESS(Status)) {
2939  ERR("balance_data_chunk returned %08x\n", Status);
2940  Vcb->balance.status = Status;
2942  rc->reloc = false;
2943  return Status;
2944  }
2945 
// Blocks here while the balance event is not signalled.
2946  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2947 
2948  if (Vcb->readonly)
2949  Vcb->balance.stopping = true;
2950 
// Early exit on stop: note that *newchunk has not been assigned on this path.
2951  if (Vcb->balance.stopping)
2952  return STATUS_SUCCESS;
2953  } while (changed);
2954 
2956 
// Mark the chunk dirty and stamp it with this run's number so it is not selected again.
2957  rc->changed = true;
2958  rc->space_changed = true;
2959  rc->balance_num = Vcb->balance.balance_num;
2960 
2961  Status = do_write(Vcb, NULL);
2962  if (!NT_SUCCESS(Status)) {
2963  ERR("do_write returned %08x\n", Status);
2964  return Status;
2965  }
2966 
2967  free_trees(Vcb);
2968  }
2969 
2970  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2971 
// rc is reused here as the output of alloc_chunk.
2972  Status = alloc_chunk(Vcb, flags, &rc, true);
2973 
2974  ExReleaseResourceLite(&Vcb->chunk_lock);
2975 
2976  if (NT_SUCCESS(Status)) {
2977  *newchunk = rc;
2978  return Status;
2979  } else {
2980  ERR("alloc_chunk returned %08x\n", Status);
2981  return Status;
2982  }
2983 }
2984 
2986  LIST_ENTRY* le;
2987 
2988  while (!IsListEmpty(&dev->space)) {
2990 
2991  ExFreePool(s);
2992  }
2993 
2994  // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
2995 
2996  space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
2997 
2998  le = Vcb->chunks.Flink;
2999  while (le != &Vcb->chunks) {
3000  uint16_t n;
3002  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3003 
3004  for (n = 0; n < c->chunk_item->num_stripes; n++) {
3005  uint64_t stripe_size = 0;
3006 
3007  if (cis[n].dev_id == dev->devitem.dev_id) {
3008  if (stripe_size == 0) {
3009  uint16_t factor;
3010 
3011  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3012  factor = c->chunk_item->num_stripes;
3013  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3014  factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3015  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3016  factor = c->chunk_item->num_stripes - 1;
3017  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3018  factor = c->chunk_item->num_stripes - 2;
3019  else // SINGLE, DUP, RAID1
3020  factor = 1;
3021 
3022  stripe_size = c->chunk_item->size / factor;
3023  }
3024 
3025  space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3026  }
3027  }
3028 
3029  le = le->Flink;
3030  }
3031 
3032  return STATUS_SUCCESS;
3033 }
3034 
3035 _Function_class_(KSTART_ROUTINE)
3036 void __stdcall balance_thread(void* context) {
3038  LIST_ENTRY chunks;
3039  LIST_ENTRY* le;
3040  uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3041  uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3042  NTSTATUS Status;
3043 
3044  Vcb->balance.balance_num++;
3045 
3046  Vcb->balance.stopping = false;
3047  KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3048 
3049  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3050  old_data_flags = Vcb->data_flags;
3051  Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3052 
3054  }
3055 
3056  if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3057  old_metadata_flags = Vcb->metadata_flags;
3058  Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3059  }
3060 
3061  if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3062  old_system_flags = Vcb->system_flags;
3063  Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3064  }
3065 
3066  if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3067  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3068  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3069  else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3070  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3071  }
3072 
3073  num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3074  Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3075 
3076  InitializeListHead(&chunks);
3077 
3078  // FIXME - what are we supposed to do with limit_start?
3079 
3080  if (!Vcb->readonly) {
3081  if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3083  if (!NT_SUCCESS(Status)) {
3084  ERR("add_balance_item returned %08x\n", Status);
3085  Vcb->balance.status = Status;
3086  goto end;
3087  }
3088  } else {
3089  if (Vcb->need_write) {
3090  Status = do_write(Vcb, NULL);
3091 
3092  free_trees(Vcb);
3093 
3094  if (!NT_SUCCESS(Status)) {
3095  ERR("do_write returned %08x\n", Status);
3096  Vcb->balance.status = Status;
3097  goto end;
3098  }
3099  }
3100  }
3101  }
3102 
3103  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3104 
3105  if (Vcb->balance.stopping)
3106  goto end;
3107 
3108  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3109 
3110  le = Vcb->chunks.Flink;
3111  while (le != &Vcb->chunks) {
3113  uint8_t sort;
3114 
3116 
3117  if (c->chunk_item->type & BLOCK_FLAG_DATA)
3119  else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3121  else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3123  else {
3124  ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3126  break;
3127  }
3128 
3129  if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3131  InsertTailList(&chunks, &c->list_entry_balance);
3132 
3133  num_chunks[sort]++;
3134  Vcb->balance.total_chunks++;
3135  Vcb->balance.chunks_left++;
3136  } else if (sort == BALANCE_OPTS_METADATA)
3137  okay_metadata_chunks++;
3138  else if (sort == BALANCE_OPTS_DATA)
3139  okay_data_chunks++;
3140  else if (sort == BALANCE_OPTS_SYSTEM)
3141  okay_system_chunks++;
3142 
3143  if (!c->cache_loaded) {
3145 
3146  if (!NT_SUCCESS(Status)) {
3147  ERR("load_cache_chunk returned %08x\n", Status);
3148  Vcb->balance.status = Status;
3150  ExReleaseResourceLite(&Vcb->chunk_lock);
3151  goto end;
3152  }
3153  }
3154 
3156 
3157  le = le->Flink;
3158  }
3159 
3160  ExReleaseResourceLite(&Vcb->chunk_lock);
3161 
3162  // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3163  if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3164  bool consolidated = false;
3165  chunk* c;
3166 
3167  if (okay_metadata_chunks == 0) {
3168  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3169 
3170  Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3171  if (NT_SUCCESS(Status))
3172  c->balance_num = Vcb->balance.balance_num;
3173  else if (Status != STATUS_DISK_FULL || consolidated) {
3174  ERR("alloc_chunk returned %08x\n", Status);
3175  ExReleaseResourceLite(&Vcb->chunk_lock);
3176  Vcb->balance.status = Status;
3177  goto end;
3178  }
3179 
3180  ExReleaseResourceLite(&Vcb->chunk_lock);
3181 
3182  if (Status == STATUS_DISK_FULL) {
3183  Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3184  if (!NT_SUCCESS(Status)) {
3185  ERR("try_consolidation returned %08x\n", Status);
3186  Vcb->balance.status = Status;
3187  goto end;
3188  } else
3189  c->balance_num = Vcb->balance.balance_num;
3190 
3191  consolidated = true;
3192 
3193  if (Vcb->balance.stopping)
3194  goto end;
3195  }
3196  }
3197 
3198  if (okay_data_chunks == 0) {
3199  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3200 
3201  Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3202  if (NT_SUCCESS(Status))
3203  c->balance_num = Vcb->balance.balance_num;
3204  else if (Status != STATUS_DISK_FULL || consolidated) {
3205  ERR("alloc_chunk returned %08x\n", Status);
3206  ExReleaseResourceLite(&Vcb->chunk_lock);
3207  Vcb->balance.status = Status;
3208  goto end;
3209  }
3210 
3211  ExReleaseResourceLite(&Vcb->chunk_lock);
3212 
3213  if (Status == STATUS_DISK_FULL) {
3214  Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3215  if (!NT_SUCCESS(Status)) {
3216  ERR("try_consolidation returned %08x\n", Status);
3217  Vcb->balance.status = Status;
3218  goto end;
3219  } else
3220  c->balance_num = Vcb->balance.balance_num;
3221 
3222  consolidated = true;
3223 
3224  if (Vcb->balance.stopping)
3225  goto end;
3226  }
3227  }
3228 
3229  if (okay_system_chunks == 0) {
3230  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3231 
3232  Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3233  if (NT_SUCCESS(Status))
3234  c->balance_num = Vcb->balance.balance_num;
3235  else if (Status != STATUS_DISK_FULL || consolidated) {
3236  ERR("alloc_chunk returned %08x\n", Status);
3237  ExReleaseResourceLite(&Vcb->chunk_lock);
3238  Vcb->balance.status = Status;
3239  goto end;
3240  }
3241 
3242  ExReleaseResourceLite(&Vcb->chunk_lock);
3243 
3244  if (Status == STATUS_DISK_FULL) {
3245  Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3246  if (!NT_SUCCESS(Status)) {
3247  ERR("try_consolidation returned %08x\n", Status);
3248  Vcb->balance.status = Status;
3249  goto end;
3250  } else
3251  c->balance_num = Vcb->balance.balance_num;
3252 
3253  consolidated = true;
3254 
3255  if (Vcb->balance.stopping)
3256  goto end;
3257  }
3258  }
3259  }
3260 
3261  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3262 
3263  le = chunks.Flink;
3264  while (le != &chunks) {
3265  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3266 
3267  c->reloc = true;
3268 
3269  le = le->Flink;
3270  }
3271 
3272  ExReleaseResourceLite(&Vcb->chunk_lock);
3273 
3274  // do data chunks before metadata
3275  le = chunks.Flink;
3276  while (le != &chunks) {
3277  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3278  LIST_ENTRY* le2 = le->Flink;
3279 
3280  if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3281  bool changed;
3282 
3283  do {
3284  changed = false;
3285 
3286  Status = balance_data_chunk(Vcb, c, &changed);
3287  if (!NT_SUCCESS(Status)) {
3288  ERR("balance_data_chunk returned %08x\n", Status);
3289  Vcb->balance.status = Status;
3290  goto end;
3291  }
3292 
3293  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3294 
3295  if (Vcb->readonly)
3296  Vcb->balance.stopping = true;
3297 
3298  if (Vcb->balance.stopping)
3299  break;
3300  } while (changed);
3301 
3302  c->changed = true;
3303  c->space_changed = true;
3304  }
3305 
3306  if (Vcb->balance.stopping)
3307  goto end;
3308 
3309  if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3310  (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3311  RemoveEntryList(&c->list_entry_balance);
3312  c->list_entry_balance.Flink = NULL;
3313 
3314  Vcb->balance.chunks_left--;
3315  }
3316 
3317  le = le2;
3318  }
3319 
3320  // do metadata chunks
3321  while (!IsListEmpty(&chunks)) {
3322  chunk* c;
3323  bool changed;
3324 
3325  le = RemoveHeadList(&chunks);
3326  c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3327 
3328  if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3329  do {
3330  Status = balance_metadata_chunk(Vcb, c, &changed);
3331  if (!NT_SUCCESS(Status)) {
3332  ERR("balance_metadata_chunk returned %08x\n", Status);
3333  Vcb->balance.status = Status;
3334  goto end;
3335  }
3336 
3337  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3338 
3339  if (Vcb->readonly)
3340  Vcb->balance.stopping = true;
3341 
3342  if (Vcb->balance.stopping)
3343  break;
3344  } while (changed);
3345 
3346  c->changed = true;
3347  c->space_changed = true;
3348  }
3349 
3350  if (Vcb->balance.stopping)
3351  break;
3352 
3353  c->list_entry_balance.Flink = NULL;
3354 
3355  Vcb->balance.chunks_left--;
3356  }
3357 
3358 end:
3359  if (!Vcb->readonly) {
3360  if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3361  le = chunks.Flink;
3362  while (le != &chunks) {
3363  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3364  c->reloc = false;
3365 
3366  le = le->Flink;
3367  c->list_entry_balance.Flink = NULL;
3368  }
3369 
3370  if (old_data_flags != 0)
3371  Vcb->data_flags = old_data_flags;
3372 
3373  if (old_metadata_flags != 0)
3374  Vcb->metadata_flags = old_metadata_flags;
3375 
3376  if (old_system_flags != 0)
3377  Vcb->system_flags = old_system_flags;
3378  }
3379 
3380  if (Vcb->balance.removing) {
3381  device* dev = NULL;
3382 
3383  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3384 
3385  le = Vcb->devices.Flink;
3386  while (le != &Vcb->devices) {
3387  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3388 
3389  if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3390  dev = dev2;
3391  break;
3392  }
3393 
3394  le = le->Flink;
3395  }
3396 
3397  if (dev) {
3398  if (Vcb->balance.chunks_left == 0) {
3400 
3401  if (!NT_SUCCESS(Status)) {
3402  ERR("finish_removing_device returned %08x\n", Status);
3403  dev->reloc = false;
3404  }
3405  } else
3406  dev->reloc = false;
3407  }
3408 
3409  ExReleaseResourceLite(&Vcb->tree_lock);
3410  } else if (Vcb->balance.shrinking) {
3411  device* dev = NULL;
3412 
3413  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3414 
3415  le = Vcb->devices.Flink;
3416  while (le != &Vcb->devices) {
3417  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3418 
3419  if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3420  dev = dev2;
3421  break;
3422  }
3423 
3424  le = le->Flink;
3425  }
3426 
3427  if (!dev) {
3428  ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3429  Vcb->balance.status = STATUS_INTERNAL_ERROR;
3430  }
3431 
3432  if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3433  if (dev) {
3435  if (!NT_SUCCESS(Status))
3436  WARN("regenerate_space_list returned %08x\n", Status);
3437  }
3438  } else {
3439  uint64_t old_size;
3440 
3441  old_size = dev->devitem.num_bytes;
3442  dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3443 
3445  if (!NT_SUCCESS(Status)) {
3446  ERR("update_dev_item returned %08x\n", Status);
3447  dev->devitem.num_bytes = old_size;
3448  Vcb->balance.status = Status;
3449 
3451  if (!NT_SUCCESS(Status))
3452  WARN("regenerate_space_list returned %08x\n", Status);
3453  } else {
3454  Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3455 
3456  Status = do_write(Vcb, NULL);
3457  if (!NT_SUCCESS(Status))
3458  ERR("do_write returned %08x\n", Status);
3459 
3460  free_trees(Vcb);
3461  }
3462  }
3463 
3464  ExReleaseResourceLite(&Vcb->tree_lock);
3465 
3466  if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3468  } else {
3470  if (!NT_SUCCESS(Status)) {
3471  ERR("remove_balance_item returned %08x\n", Status);
3472  goto end;
3473  }
3474  }
3475 
3476  if (Vcb->trim && !Vcb->options.no_trim) {
3477  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3478 
3479  le = Vcb->devices.Flink;
3480  while (le != &Vcb->devices) {
3481  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3482 
3483  if (dev2->devobj && !dev2->readonly && dev2->trim)
3484  trim_unalloc_space(Vcb, dev2);
3485 
3486  le = le->Flink;
3487  }
3488 
3489  ExReleaseResourceLite(&Vcb->tree_lock);
3490  }
3491  }
3492 
3493  ZwClose(Vcb->balance.thread);
3494  Vcb->balance.thread = NULL;
3495 
3496  KeSetEvent(&Vcb->balance.finished, 0, false);
3497 }
3498 
3500  NTSTATUS Status;
3502  uint8_t i;
3503 
3504  if (length < sizeof(btrfs_start_balance) || !data)
3505  return STATUS_INVALID_PARAMETER;
3506 
3507  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3509 
3510  if (Vcb->locked) {
3511  WARN("cannot start balance while locked\n");
3512  return STATUS_DEVICE_NOT_READY;
3513  }
3514 
3515  if (Vcb->scrub.thread) {
3516  WARN("cannot start balance while scrub running\n");
3517  return STATUS_DEVICE_NOT_READY;
3518  }
3519 
3520  if (Vcb->balance.thread) {
3521  WARN("balance already running\n");
3522  return STATUS_DEVICE_NOT_READY;
3523  }
3524 
3525  if (Vcb->readonly)
3527 
3531  return STATUS_SUCCESS;
3532 
3533  for (i = 0; i < 3; i++) {
3534  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3535  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3538 
3539  if (bsb->opts[i].profiles == 0)
3540  return STATUS_INVALID_PARAMETER;
3541  }
3542 
3543  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3544  if (bsb->opts[i].devid == 0)
3545  return STATUS_INVALID_PARAMETER;
3546  }
3547 
3548  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3549  if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3550  return STATUS_INVALID_PARAMETER;
3551  }
3552 
3553  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3554  if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3555  return STATUS_INVALID_PARAMETER;
3556  }
3557 
3558  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3559  bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3560  bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3561 
3562  if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3563  return STATUS_INVALID_PARAMETER;
3564  }
3565 
3566  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3567  bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3568  bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3569 
3570  if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3571  return STATUS_INVALID_PARAMETER;
3572  }
3573 
3574  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3575  bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3576  bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3577 
3578  if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3579  return STATUS_INVALID_PARAMETER;
3580  }
3581 
3582  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3583  if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3585  bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3586  bsb->opts[i].convert != BLOCK_FLAG_SINGLE)
3587  return STATUS_INVALID_PARAMETER;
3588  }
3589  }
3590  }
3591 
3592  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3595 
3596  Vcb->balance.paused = false;
3597  Vcb->balance.removing = false;
3598  Vcb->balance.shrinking = false;
3599  Vcb->balance.status = STATUS_SUCCESS;
3600  KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3601 
3602  Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3603  if (!NT_SUCCESS(Status)) {
3604  ERR("PsCreateSystemThread returned %08x\n", Status);
3605  return Status;
3606  }
3607 
3608  return STATUS_SUCCESS;
3609 }
3610 
3612  KEY searchkey;
3613  traverse_ptr tp;
3614  NTSTATUS Status;
3615  BALANCE_ITEM* bi;
3616  int i;
3617 
3618  searchkey.obj_id = BALANCE_ITEM_ID;
3619  searchkey.obj_type = TYPE_TEMP_ITEM;
3620  searchkey.offset = 0;
3621 
3622  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3623  if (!NT_SUCCESS(Status)) {
3624  ERR("find_item returned %08x\n", Status);
3625  return Status;
3626  }
3627 
3628  if (keycmp(tp.item->key, searchkey)) {
3629  TRACE("no balance item found\n");
3630  return STATUS_NOT_FOUND;
3631  }
3632 
3633  if (tp.item->size < sizeof(BALANCE_ITEM)) {
3634  WARN("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3635  tp.item->size, sizeof(BALANCE_ITEM));
3636  return STATUS_INTERNAL_ERROR;
3637  }
3638 
3639  bi = (BALANCE_ITEM*)tp.item->data;
3640 
3641  if (bi->flags & BALANCE_FLAGS_DATA)
3642  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3643 
3644  if (bi->flags & BALANCE_FLAGS_METADATA)
3645  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3646 
3647  if (bi->flags & BALANCE_FLAGS_SYSTEM)
3648  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3649 
3650  // do the heuristics that Linux driver does
3651 
3652  for (i = 0; i < 3; i++) {
3653  if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3654  // if converting, don't redo chunks already done
3655 
3656  if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3657  Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3658 
3659  // don't balance chunks more than 90% filled - presumably these
3660  // have already been done
3661 
3662  if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3663  !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3664  ) {
3665  Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3666  Vcb->balance.opts[i].usage_start = 0;
3667  Vcb->balance.opts[i].usage_end = 90;
3668  }
3669  }
3670  }
3671 
3672  if (Vcb->readonly || Vcb->options.skip_balance)
3673  Vcb->balance.paused = true;
3674  else
3675  Vcb->balance.paused = false;
3676 
3677  Vcb->balance.removing = false;
3678  Vcb->balance.shrinking = false;
3679  Vcb->balance.status = STATUS_SUCCESS;
3680  KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3681 
3682  Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3683  if (!NT_SUCCESS(Status)) {
3684  ERR("PsCreateSystemThread returned %08x\n", Status);
3685  return Status;
3686  }
3687 
3688  return STATUS_SUCCESS;
3689 }
3690 
3693 
3694  if (length < sizeof(btrfs_query_balance) || !data)
3695  return STATUS_INVALID_PARAMETER;
3696 
3697  if (!Vcb->balance.thread) {
3699 
3700  if (!NT_SUCCESS(Vcb->balance.status)) {
3701  bqb->status |= BTRFS_BALANCE_ERROR;
3702  bqb->error = Vcb->balance.status;
3703  }
3704 
3705  return STATUS_SUCCESS;
3706  }
3707 
3708  bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3709 
3710  if (Vcb->balance.removing)
3711  bqb->status |= BTRFS_BALANCE_REMOVAL;
3712 
3713  if (Vcb->balance.shrinking)
3715 
3716  if (!NT_SUCCESS(Vcb->balance.status))
3717  bqb->status |= BTRFS_BALANCE_ERROR;
3718 
3719  bqb->chunks_left = Vcb->balance.chunks_left;
3720  bqb->total_chunks = Vcb->balance.total_chunks;
3721  bqb->error = Vcb->balance.status;
3722  RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3723  RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3724  RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3725 
3726  return STATUS_SUCCESS;
3727 }
3728 
3730  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3732 
3733  if (!Vcb->balance.thread)
3734  return STATUS_DEVICE_NOT_READY;
3735 
3736  if (Vcb->balance.paused)
3737  return STATUS_DEVICE_NOT_READY;
3738 
3739  Vcb->balance.paused = true;
3740  KeClearEvent(&Vcb->balance.event);
3741 
3742  return STATUS_SUCCESS;
3743 }
3744 
/* Resumes a paused balance: clears Vcb->balance.paused and signals Vcb->balance.event.
 * Returns STATUS_DEVICE_NOT_READY when there is no balance thread or it is not paused.
 *
 * NOTE(review): listing line 3745 (the function signature) is absent from this extracted
 * listing - confirm parameter list against the real source file. */
/* Caller must hold SE_MANAGE_VOLUME_PRIVILEGE for the given processor mode.
 * NOTE(review): listing line 3747 (the statement taken when the check fails, presumably an
 * error return) is missing from the extraction - confirm. */
3746  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3748 
3749  if (!Vcb->balance.thread)
3750  return STATUS_DEVICE_NOT_READY;
3751 
3752  if (!Vcb->balance.paused)
3753  return STATUS_DEVICE_NOT_READY;
3754 
/* NOTE(review): listing line 3756 (the statement taken for a read-only volume, presumably an
 * error return) is missing from the extraction - confirm. */
3755  if (Vcb->readonly)
3757 
/* Clear the paused flag and set the event to signaled. */
3758  Vcb->balance.paused = false;
3759  KeSetEvent(&Vcb->balance.event, 0, false);
3760 
3761  return STATUS_SUCCESS;
3762 }
3763 
/* Requests that the balance stop: sets Vcb->balance.stopping, clears the paused flag, resets
 * the recorded status to STATUS_SUCCESS and signals Vcb->balance.event.
 * Returns STATUS_DEVICE_NOT_READY when there is no balance thread.
 *
 * NOTE(review): listing line 3764 (the function signature) is absent from this extracted
 * listing - confirm parameter list against the real source file. */
/* Caller must hold SE_MANAGE_VOLUME_PRIVILEGE for the given processor mode.
 * NOTE(review): listing line 3766 (the statement taken when the check fails, presumably an
 * error return) is missing from the extraction - confirm. */
3765  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3767 
3768  if (!Vcb->balance.thread)
3769  return STATUS_DEVICE_NOT_READY;
3770 
/* The event is signaled even if the balance was paused; presumably this is so the balance
 * thread can observe `stopping` - verify in the thread routine. */
3771  Vcb->balance.paused = false;
3772  Vcb->balance.stopping = true;
3773  Vcb->balance.status = STATUS_SUCCESS;
3774  KeSetEvent(&Vcb->balance.event, 0, false);
3775 
3776  return STATUS_SUCCESS;
3777 }
3778 
3780  uint64_t devid;
3781  LIST_ENTRY* le;
3782  device* dev = NULL;
3783  NTSTATUS Status;
3784  int i;
3785  uint64_t num_rw_devices;
3786 
3787  TRACE("(%p, %p, %x)\n", Vcb, data, length);
3788 
3789  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3791 
3792  if (length < sizeof(uint64_t))
3793  return STATUS_INVALID_PARAMETER;
3794 
3795  devid = *(uint64_t*)data;
3796 
3797  ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3798 
3799  if (Vcb->readonly) {
3800  ExReleaseResourceLite(&Vcb->tree_lock);
3802  }
3803 
3804  num_rw_devices = 0;
3805 
3806  le = Vcb->devices.Flink;
3807  while (le != &Vcb->devices) {
3808  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3809 
3810  if (dev2->devitem.dev_id == devid)
3811  dev = dev2;
3812 
3813  if (!dev2->readonly)
3814  num_rw_devices++;
3815 
3816  le = le->Flink;
3817  }
3818 
3819  if (!dev) {
3820  ExReleaseResourceLite(&Vcb->tree_lock);
3821  WARN("device %I64x not found\n", devid);
3822  return STATUS_NOT_FOUND;
3823  }
3824 
3825  if (!dev->readonly) {
3826  if (num_rw_devices == 1) {
3827  ExReleaseResourceLite(&Vcb->tree_lock);
3828  WARN("not removing last non-readonly device\n");
3829  return STATUS_INVALID_PARAMETER;
3830  }
3831 
3832  if (num_rw_devices == 4 &&
3833  ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3834  (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6))
3835  ) {
3836  ExReleaseResourceLite(&Vcb->tree_lock);
3837  ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n");
3838  return STATUS_CANNOT_DELETE;
3839  }
3840 
3841  if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) {
3842  ExReleaseResourceLite(&Vcb->tree_lock);
3843  ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n");
3844  return STATUS_CANNOT_DELETE;
3845  }
3846 
3847  if (num_rw_devices == 2 &&
3848  ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3849  (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3850  ) {
3851  ExReleaseResourceLite(&Vcb->tree_lock);
3852  ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3853  return STATUS_CANNOT_DELETE;
3854  }
3855  }
3856 
3857  ExReleaseResourceLite(&Vcb->tree_lock);
3858 
3859  if (Vcb->balance.thread) {
3860  WARN("balance already running\n");
3861  return STATUS_DEVICE_NOT_READY;
3862  }
3863 
3864  dev->reloc = true;
3865 
3866  RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3867 
3868  for (i = 0; i < 3; i++) {
3869  Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3870  Vcb->balance.opts[i]