ReactOS  0.4.14-dev-833-g5f692ed
balance.c
Go to the documentation of this file.
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21 
// Relocation bookkeeping structures used by the balance code below.
// NOTE(review): this listing drops most member lines and several closing lines of
// these typedefs (the embedded line numbers jump, e.g. 22 -> 27 -> 28 -> 32), so only
// fragments are visible. Only the name "data_reloc" can be read directly.

// Presumably metadata_reloc: the code below accesses mr->t and mr->system on
// metadata_reloc pointers -- confirm against the full source.
22 typedef struct {
27  tree* t;
28  bool system;
32 
// Presumably the per-back-reference record for a metadata_reloc: the code below
// reads and writes ref->top on entries of mr->refs -- confirm against the full source.
33 typedef struct {
36 
37  union {
40  };
41 
43  bool top;
46 
// data_reloc: the code below uses its address, size, ei, new_address and refs members.
47 typedef struct {
55 } data_reloc;
56 
// NOTE(review): name and members of this struct are not visible in the listing.
57 typedef struct {
60 
61  union {
64  };
65 
69 
// Fallback definition of the DSM trim flag for non-MSVC builds, whose headers
// (per the original note: mingw) do not provide it yet.
70 #ifndef _MSC_VER // not in mingw yet
71 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
72 #endif
73 
// Size of one read during balancing: 0x100000 bytes (1 MB).
74 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
75 
// add_metadata_reloc (name taken from the call sites below; the first line of the
// signature is missing from this listing).
//
// Creates a metadata_reloc record for the tree-block extent item that tp points at:
//   - records the extent's address and a pointer to its EXTENT_ITEM,
//   - deletes the extent item from its tree,
//   - if a chunk c is supplied, gives the block's space back to that chunk,
//   - collects every inline TREE_BLOCK_REF / SHARED_BLOCK_REF into a ref record,
//   - if the inline refs do not account for ei->refcount, walks the following items
//     with the same object id, collecting and deleting the non-inline refs.
//
// Parameters visible here:
//   skinny   - true when the item has no EXTENT_ITEM2 between EXTENT_ITEM and the refs
//   mr2      - optional; receives the new metadata_reloc on success
//   c        - optional chunk whose accounting is updated
//   rollback - passed through to space_list_add
//
// Returns STATUS_SUCCESS, the failing status of delete_tree_item, or
// STATUS_INTERNAL_ERROR when the inline reference data is malformed.
//
// NOTE(review): several statements (allocations, list insertions, some returns) are
// missing from this listing; comments below only describe what is visible.
77  bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
79  metadata_reloc* mr;
80  EXTENT_ITEM* ei;
81  uint16_t len;
82  uint64_t inline_rc;
83  uint8_t* ptr;
84 
86  if (!mr) {
87  ERR("out of memory\n");
89  }
90 
91  mr->address = tp->item->key.obj_id;
92  mr->data = NULL;
93  mr->ei = (EXTENT_ITEM*)tp->item->data;
94  mr->system = false;
96 
98  if (!NT_SUCCESS(Status)) {
99  ERR("delete_tree_item returned %08x\n", Status);
100  ExFreePool(mr);
101  return Status;
102  }
103 
// NOTE(review): the statement guarded by "if (!c)" is not visible; presumably it looks
// the chunk up from the extent address -- confirm.
104  if (!c)
106 
107  if (c) {
109 
// The tree block is leaving this chunk: drop it from the used count and return its
// range to the chunk's space list.
110  c->used -= Vcb->superblock.node_size;
111 
112  space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
113 
115  }
116 
117  ei = (EXTENT_ITEM*)tp->item->data;
118  inline_rc = 0;
119 
// Inline references start after EXTENT_ITEM, and after EXTENT_ITEM2 as well when the
// item is not in skinny format.
// NOTE(review): len is uint16_t and no check is visible here that the item is large
// enough for these subtractions -- confirm callers guarantee it, otherwise len wraps.
120  len = tp->item->size - sizeof(EXTENT_ITEM);
121  ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
122  if (!skinny) {
123  len -= sizeof(EXTENT_ITEM2);
124  ptr += sizeof(EXTENT_ITEM2);
125  }
126 
// Each inline entry is one type byte followed by a type-specific payload.
127  while (len > 0) {
128  uint8_t secttype = *ptr;
129  uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
131 
// Account for the type byte just read.
132  len--;
133 
// NOTE(review): mr is not released on the two error returns below in the visible
// code -- confirm whether that is handled elsewhere.
134  if (sectlen > len) {
135  ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
136  return STATUS_INTERNAL_ERROR;
137  }
138 
139  if (sectlen == 0) {
140  ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
141  return STATUS_INTERNAL_ERROR;
142  }
143 
145  if (!ref) {
146  ERR("out of memory\n");
148  }
149 
// Copy the payload that follows the type byte into the matching union member.
150  if (secttype == TYPE_TREE_BLOCK_REF) {
151  ref->type = TYPE_TREE_BLOCK_REF;
152  RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
153  inline_rc++;
154  } else if (secttype == TYPE_SHARED_BLOCK_REF) {
155  ref->type = TYPE_SHARED_BLOCK_REF;
156  RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
157  inline_rc++;
158  } else {
159  ERR("unexpected tree type %x\n", secttype);
160  ExFreePool(ref);
161  return STATUS_INTERNAL_ERROR;
162  }
163 
// parent and top are filled in later (see the parent-resolution pass that sets
// ref->parent / ref->top further down in this file).
164  ref->parent = NULL;
165  ref->top = false;
167 
168  len -= sectlen;
169  ptr += sizeof(uint8_t) + sectlen;
170  }
171 
172  if (inline_rc < ei->refcount) { // look for non-inline entries
173  traverse_ptr tp2 = *tp, next_tp;
174 
// Non-inline references are separate items with the same object id; stop at the
// first item that belongs to a different object.
175  while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
176  tp2 = next_tp;
177 
178  if (tp2.item->key.obj_id == tp->item->key.obj_id) {
179  if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
181  if (!ref) {
182  ERR("out of memory\n");
184  }
185 
// For a standalone ref item the referenced value is carried in the key offset.
186  ref->type = TYPE_TREE_BLOCK_REF;
187  ref->tbr.offset = tp2.item->key.offset;
188  ref->parent = NULL;
189  ref->top = false;
191 
192  Status = delete_tree_item(Vcb, &tp2);
193  if (!NT_SUCCESS(Status)) {
194  ERR("delete_tree_item returned %08x\n", Status);
195  return Status;
196  }
197  } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
199  if (!ref) {
200  ERR("out of memory\n");
202  }
203 
204  ref->type = TYPE_SHARED_BLOCK_REF;
205  ref->sbr.offset = tp2.item->key.offset;
206  ref->parent = NULL;
207  ref->top = false;
209 
210  Status = delete_tree_item(Vcb, &tp2);
211  if (!NT_SUCCESS(Status)) {
212  ERR("delete_tree_item returned %08x\n", Status);
213  return Status;
214  }
215  }
216  } else
217  break;
218  }
219  }
220 
222 
// Hand the new record back only when the caller asked for it.
223  if (mr2)
224  *mr2 = mr;
225 
226  return STATUS_SUCCESS;
227 }
228 
// add_metadata_reloc_parent (name taken from the call sites below; the signature
// lines are missing from this listing). Call sites pass
// (Vcb, items, address, &mr2, rollback).
//
// Returns, through mr2, the metadata_reloc for the tree block at `address`:
//   - if one already exists in `items`, that one is returned;
//   - otherwise the block's extent item is looked up in the extent root, validated,
//     and a new record is created via add_metadata_reloc (with no chunk supplied).
//
// Returns STATUS_SUCCESS, a failing status from find_item / add_metadata_reloc, or
// STATUS_INTERNAL_ERROR when no valid tree-block extent item is found.
231  LIST_ENTRY* le;
232  KEY searchkey;
234  bool skinny = false;
236 
// Reuse an existing record for this address if there is one.
237  le = items->Flink;
238  while (le != items) {
240 
241  if (mr->address == address) {
242  *mr2 = mr;
243  return STATUS_SUCCESS;
244  }
245 
246  le = le->Flink;
247  }
248 
// Search with the maximum offset so the lookup lands on the item for this address
// whichever of the two accepted item types it uses.
249  searchkey.obj_id = address;
250  searchkey.obj_type = TYPE_METADATA_ITEM;
251  searchkey.offset = 0xffffffffffffffff;
252 
253  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
254  if (!NT_SUCCESS(Status)) {
255  ERR("find_item returned %08x\n", Status);
256  return Status;
257  }
258 
// NOTE(review): the condition that selects the skinny case is not visible in this
// listing. The alternative accepted form is an EXTENT_ITEM whose key offset equals
// the node size and whose tree-block flag is set.
260  skinny = true;
261  else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
262  tp.item->size >= sizeof(EXTENT_ITEM)) {
264 
265  if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
266  ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
267  return STATUS_INTERNAL_ERROR;
268  }
269  } else {
270  ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
271  return STATUS_INTERNAL_ERROR;
272  }
273 
274  Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
275  if (!NT_SUCCESS(Status)) {
276  ERR("add_metadata_reloc returned %08x\n", Status);
277  return Status;
278  }
279 
280  return STATUS_SUCCESS;
281 }
282 
// Re-orders mr->refs with an insertion sort (the function's signature line is missing
// from this listing, so its declared name is not visible here).
//
// Each ref gets a hash first: tbr.offset for TYPE_TREE_BLOCK_REF, the parent's new
// address for TYPE_SHARED_BLOCK_REF. Refs are then placed so that a ref goes ahead of
// the first already-sorted entry that has a larger type value, or the same type and
// a smaller hash.
// NOTE(review): the statements that unlink a ref from mr->refs and that insert it
// ahead of ref2 are not visible -- the description above assumes they do that.
284  LIST_ENTRY newlist, *le;
285 
286  if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
287  return;
288 
289  // insertion sort
290 
291  InitializeListHead(&newlist);
292 
293  while (!IsListEmpty(&mr->refs)) {
295  bool inserted = false;
296 
// A shared ref is keyed by where its parent block ends up, so ref->parent must
// already have its new_address assigned when this runs.
297  if (ref->type == TYPE_TREE_BLOCK_REF)
298  ref->hash = ref->tbr.offset;
299  else if (ref->type == TYPE_SHARED_BLOCK_REF)
300  ref->hash = ref->parent->new_address;
301 
302  le = newlist.Flink;
303  while (le != &newlist) {
305 
306  if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
308  inserted = true;
309  break;
310  }
311 
312  le = le->Flink;
313  }
314 
// No earlier position found: the ref goes at the end.
315  if (!inserted)
316  InsertTailList(&newlist, &ref->list_entry);
317  }
318 
// Splice the sorted chain back under mr->refs by repointing the end nodes at the
// original list head.
319  newlist.Flink->Blink = &mr->refs;
320  newlist.Blink->Flink = &mr->refs;
321  mr->refs.Flink = newlist.Flink;
322  mr->refs.Blink = newlist.Blink;
323 }
324 
// add_metadata_reloc_extent_item (name taken from the caller's error message below;
// the signature lines are missing from this listing).
//
// Writes the extent-tree entries for a relocated tree block at mr->new_address:
//   1. sizes a new EXTENT_ITEM, packing back-references inline until the item would
//      exceed a quarter of the node size;
//   2. inserts it as a METADATA_ITEM (skinny metadata) or an EXTENT_ITEM;
//   3. inserts the remaining references as separate items;
//   4. for every child block / data extent that holds a shared reference to the old
//      address, adds a reference to the new address and removes the old one, and
//      patches matching entries in the owning chunk's changed_extents list.
//
// Returns STATUS_SUCCESS or the first failing status from the helpers it calls.
// NOTE(review): several statements are missing from this listing (gaps in the
// embedded line numbers); comments describe only what is visible.
327  LIST_ENTRY* le;
328  uint64_t rc = 0;
329  uint16_t inline_len;
330  bool all_inline = true;
331  metadata_reloc_ref* first_noninline = NULL;
332  EXTENT_ITEM* ei;
333  uint8_t* ptr;
334 
// Fixed part of the item: EXTENT_ITEM, plus EXTENT_ITEM2 when the filesystem does
// not use skinny metadata.
335  inline_len = sizeof(EXTENT_ITEM);
336  if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
337  inline_len += sizeof(EXTENT_ITEM2);
338 
340 
// First pass: count all refs and decide how many fit inline.
341  le = mr->refs.Flink;
342  while (le != &mr->refs) {
344  uint16_t extlen = 0;
345 
346  rc++;
347 
348  if (ref->type == TYPE_TREE_BLOCK_REF)
349  extlen += sizeof(TREE_BLOCK_REF);
350  else if (ref->type == TYPE_SHARED_BLOCK_REF)
351  extlen += sizeof(SHARED_BLOCK_REF);
352 
// "+ 1" is the type byte that precedes each inline ref. Once one ref does not fit,
// it and every later ref are written as separate items.
353  if (all_inline) {
354  if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
355  all_inline = false;
356  first_noninline = ref;
357  } else
358  inline_len += extlen + 1;
359  }
360 
361  le = le->Flink;
362  }
363 
364  ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
365  if (!ei) {
366  ERR("out of memory\n");
368  }
369 
// refcount covers inline and non-inline refs alike; generation and flags are carried
// over from the original extent item.
370  ei->refcount = rc;
371  ei->generation = mr->ei->generation;
372  ei->flags = mr->ei->flags;
373  ptr = (uint8_t*)&ei[1];
374 
375  if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
376  EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
377 
// The first key is read from the bytes immediately after the node header in mr->data.
378  ei2->firstitem = *(KEY*)&mr->data[1];
379  ei2->level = mr->data->level;
380 
381  ptr += sizeof(EXTENT_ITEM2);
382  }
383 
// Second pass: serialise the inline refs (type byte, then payload).
384  le = mr->refs.Flink;
385  while (le != &mr->refs) {
387 
388  if (ref == first_noninline)
389  break;
390 
391  *ptr = ref->type;
392  ptr++;
393 
394  if (ref->type == TYPE_TREE_BLOCK_REF) {
396 
397  tbr->offset = ref->tbr.offset;
398 
399  ptr += sizeof(TREE_BLOCK_REF);
400  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
402 
// Shared refs point at the parent's relocated address, not its old one.
403  sbr->offset = ref->parent->new_address;
404 
405  ptr += sizeof(SHARED_BLOCK_REF);
406  }
407 
408  le = le->Flink;
409  }
410 
// Skinny metadata keys the item by level; the classic form keys it by node size.
411  if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
412  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
413  else
414  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
415 
// ei is freed here only when the insert failed; presumably a successful insert takes
// ownership of the buffer -- confirm against insert_tree_item.
416  if (!NT_SUCCESS(Status)) {
417  ERR("insert_tree_item returned %08x\n", Status);
418  ExFreePool(ei);
419  return Status;
420  }
421 
// Refs that did not fit inline become data-less items keyed by (new address, type, value).
422  if (!all_inline) {
423  le = &first_noninline->list_entry;
424 
425  while (le != &mr->refs) {
427 
428  if (ref->type == TYPE_TREE_BLOCK_REF) {
429  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
430  if (!NT_SUCCESS(Status)) {
431  ERR("insert_tree_item returned %08x\n", Status);
432  return Status;
433  }
434  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
435  Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
436  if (!NT_SUCCESS(Status)) {
437  ERR("insert_tree_item returned %08x\n", Status);
438  return Status;
439  }
440  }
441 
442  le = le->Flink;
443  }
444  }
445 
// NOTE(review): the condition that opens this section is not visible in the listing.
// Internal node: move each child's shared block ref from the old to the new address.
447  if (mr->data->level > 0) {
448  uint16_t i;
449  internal_node* in = (internal_node*)&mr->data[1];
450 
451  for (i = 0; i < mr->data->num_items; i++) {
453 
// sbrrc is computed on a line missing from the listing; presumably the number of
// shared refs the child holds to the old address -- confirm.
454  if (sbrrc > 0) {
455  SHARED_BLOCK_REF sbr;
456 
457  sbr.offset = mr->new_address;
458 
// Add the new reference before dropping the old one.
459  Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
460  if (!NT_SUCCESS(Status)) {
461  ERR("increase_extent_refcount returned %08x\n", Status);
462  return Status;
463  }
464 
465  sbr.offset = mr->address;
466 
467  Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
468  sbr.offset, false, NULL);
469  if (!NT_SUCCESS(Status)) {
470  ERR("decrease_extent_refcount returned %08x\n", Status);
471  return Status;
472  }
473  }
474  }
475  } else {
// Leaf node: do the same for shared data refs held by EXTENT_DATA items.
476  uint16_t i;
477  leaf_node* ln = (leaf_node*)&mr->data[1];
478 
479  for (i = 0; i < mr->data->num_items; i++) {
480  if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
// Item payloads are addressed relative to the end of the node header.
481  EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
482 
485 
486  if (ed2->size > 0) { // not sparse
488 
489  if (sdrrc > 0) {
490  SHARED_DATA_REF sdr;
491  chunk* c;
492 
493  sdr.offset = mr->new_address;
494  sdr.count = sdrrc;
495 
497  if (!NT_SUCCESS(Status)) {
498  ERR("increase_extent_refcount returned %08x\n", Status);
499  return Status;
500  }
501 
502  sdr.offset = mr->address;
503 
505  sdr.offset, false, NULL);
506  if (!NT_SUCCESS(Status)) {
507  ERR("decrease_extent_refcount returned %08x\n", Status);
508  return Status;
509  }
510 
512 
513  if (c) {
514  // check changed_extents
515 
// Pending (not yet flushed) extent changes may still name the old tree address
// in their shared data refs; repoint the first match in each of refs / old_refs.
516  ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
517 
518  le = c->changed_extents.Flink;
519 
520  while (le != &c->changed_extents) {
522 
523  if (ce->address == ed2->address) {
524  LIST_ENTRY* le2;
525 
526  le2 = ce->refs.Flink;
527  while (le2 != &ce->refs) {
529 
530  if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
531  cer->sdr.offset = mr->new_address;
532  break;
533  }
534 
535  le2 = le2->Flink;
536  }
537 
538  le2 = ce->old_refs.Flink;
539  while (le2 != &ce->old_refs) {
541 
542  if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
543  cer->sdr.offset = mr->new_address;
544  break;
545  }
546 
547  le2 = le2->Flink;
548  }
549 
// Only the first changed_extent with this address is examined.
550  break;
551  }
552 
553  le = le->Flink;
554  }
555 
556  ExReleaseResourceLite(&c->changed_extents_lock);
557  }
558  }
559  }
560  }
561  }
562  }
563  }
564  }
565 
566  return STATUS_SUCCESS;
567 }
568 
// write_metadata_items (name taken from the caller's error message below; the first
// line of the signature is missing from this listing).
//
// Relocates every tree block described in `items`:
//   1. reads each block into mr->data, rewrites data-extent addresses that appear in
//      data_items (if given), and resolves each back-reference to its parent block,
//      pulling parents into `items` as needed;
//   2. matches each block with its in-memory tree, if one is loaded;
//   3. level by level (leaves first) picks a new address for each block, patches
//      the parent node / root item / superblock pointer, re-hashes the loaded tree,
//      recomputes the node checksum and queues the block for writing;
//   4. writes the queued blocks and creates their new extent items.
//
// Returns STATUS_SUCCESS or the first failing status. The tree_write queue entries
// are freed at the end label on both success and failure.
// NOTE(review): many statements (allocations, CONTAINING_RECORD lookups, some status
// assignments) are missing from this listing; comments describe only what is visible.
570  LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
571  LIST_ENTRY tree_writes, *le;
574  uint8_t level, max_level = 0;
575  chunk* newchunk = NULL;
576 
577  InitializeListHead(&tree_writes);
578 
// Pass 1: load each block and resolve its parents. `items` may grow while it is
// being walked, since resolving a parent can add a new record.
579  le = items->Flink;
580  while (le != items) {
582  LIST_ENTRY* le2;
583  chunk* pc;
584 
585  mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
586  if (!mr->data) {
587  ERR("out of memory\n");
589  }
590 
// Pass c as the chunk hint only when the block actually lies inside it; pc receives
// the chunk that the read resolved to.
591  Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
592  c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
593  if (!NT_SUCCESS(Status)) {
594  ERR("read_data returned %08x\n", Status);
595  return Status;
596  }
597 
// Remember blocks that live in a SYSTEM chunk so they are re-allocated from one.
598  if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
599  mr->system = true;
600 
// Leaf blocks: redirect EXTENT_DATA entries whose extent has been relocated.
601  if (data_items && mr->data->level == 0) {
602  le2 = data_items->Flink;
603  while (le2 != data_items) {
605  leaf_node* ln = (leaf_node*)&mr->data[1];
606  uint16_t i;
607 
608  for (i = 0; i < mr->data->num_items; i++) {
609  if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
610  EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
611 
614 
615  if (ed2->address == dr->address)
616  ed2->address = dr->new_address;
617  }
618  }
619  }
620 
621  le2 = le2->Flink;
622  }
623  }
624 
625  if (mr->data->level > max_level)
626  max_level = mr->data->level;
627 
628  le2 = mr->refs.Flink;
629  while (le2 != &mr->refs) {
631 
// A tree block ref names the owning root; the parent is found by searching that
// root for this block's first key, down to one level above this block.
632  if (ref->type == TYPE_TREE_BLOCK_REF) {
633  KEY* firstitem;
634  root* r = NULL;
635  LIST_ENTRY* le3;
636  tree* t;
637 
638  firstitem = (KEY*)&mr->data[1];
639 
640  le3 = Vcb->roots.Flink;
641  while (le3 != &Vcb->roots) {
643 
644  if (r2->id == ref->tbr.offset) {
645  r = r2;
646  break;
647  }
648 
649  le3 = le3->Flink;
650  }
651 
652  if (!r) {
653  ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
654  return STATUS_INTERNAL_ERROR;
655  }
656 
657  Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
659  ERR("find_item_to_level returned %08x\n", Status);
660  return Status;
661  }
662 
// Climb from the found tree until reaching the level directly above this block.
663  t = tp.tree;
664  while (t && t->header.level < mr->data->level + 1) {
665  t = t->parent;
666  }
667 
// No tree at that level: this block is the top of its root.
668  if (!t)
669  ref->top = true;
670  else {
671  metadata_reloc* mr2;
672 
673  Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
674  if (!NT_SUCCESS(Status)) {
675  ERR("add_metadata_reloc_parent returned %08x\n", Status);
676  return Status;
677  }
678 
679  ref->parent = mr2;
680  }
681  } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
// A shared block ref names the parent block's address directly.
682  metadata_reloc* mr2;
683 
684  Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
685  if (!NT_SUCCESS(Status)) {
686  ERR("add_metadata_reloc_parent returned %08x\n", Status);
687  return Status;
688  }
689 
690  ref->parent = mr2;
691  }
692 
693  le2 = le2->Flink;
694  }
695 
696  le = le->Flink;
697  }
698 
// Pass 2: find the in-memory tree (if any) for each block. trees_ptrs is indexed by
// the top byte of the CRC32C of the address; the hash list is walked from that
// bucket until a match or a larger hash is seen.
699  le = items->Flink;
700  while (le != items) {
702  LIST_ENTRY* le2;
703  uint32_t hash;
704 
705  mr->t = NULL;
706 
707  hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
708 
709  le2 = Vcb->trees_ptrs[hash >> 24];
710 
711  if (le2) {
712  while (le2 != &Vcb->trees_hash) {
713  tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
714 
715  if (t->header.address == mr->address) {
716  mr->t = t;
717  break;
718  } else if (t->hash > hash)
719  break;
720 
721  le2 = le2->Flink;
722  }
723  }
724 
725  le = le->Flink;
726  }
727 
// Pass 3: process blocks one level at a time, starting with level 0.
728  for (level = 0; level <= max_level; level++) {
729  le = items->Flink;
730  while (le != items) {
732 
733  if (mr->data->level == level) {
734  bool done = false;
735  LIST_ENTRY* le2;
736  tree_write* tw;
737  uint64_t flags;
738  tree* t3;
739 
// Choose the chunk type the replacement block must be allocated from.
740  if (mr->system)
741  flags = Vcb->system_flags;
742  else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
743  flags = Vcb->data_flags;
744  else
745  flags = Vcb->metadata_flags;
746 
// First choice: the chunk that satisfied the previous allocation.
747  if (newchunk) {
748  acquire_chunk_lock(newchunk, Vcb);
749 
750  if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
751  newchunk->used += Vcb->superblock.node_size;
752  space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
753  done = true;
754  }
755 
756  release_chunk_lock(newchunk, Vcb);
757  }
758 
// Second choice: any writable chunk of the right type that is not itself being
// relocated and has at least one node of free space.
759  if (!done) {
760  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
761 
762  le2 = Vcb->chunks.Flink;
763  while (le2 != &Vcb->chunks) {
765 
766  if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
767  acquire_chunk_lock(c2, Vcb);
768 
769  if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
771  c2->used += Vcb->superblock.node_size;
772  space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback);
773  release_chunk_lock(c2, Vcb);
774  newchunk = c2;
775  done = true;
776  break;
777  }
778  }
779 
780  release_chunk_lock(c2, Vcb);
781  }
782 
783  le2 = le2->Flink;
784  }
785 
786  // allocate new chunk if necessary
787  if (!done) {
788  Status = alloc_chunk(Vcb, flags, &newchunk, false);
789 
790  if (!NT_SUCCESS(Status)) {
791  ERR("alloc_chunk returned %08x\n", Status);
792  ExReleaseResourceLite(&Vcb->chunk_lock);
793  goto end;
794  }
795 
796  acquire_chunk_lock(newchunk, Vcb);
797 
// Tag the fresh chunk with the current balance run number.
798  newchunk->balance_num = Vcb->balance.balance_num;
799 
800  if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
801  release_chunk_lock(newchunk, Vcb);
802  ExReleaseResourceLite(&Vcb->chunk_lock);
803  ERR("could not find address in new chunk\n");
805  goto end;
806  } else {
807  newchunk->used += Vcb->superblock.node_size;
808  space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
809  }
810 
811  release_chunk_lock(newchunk, Vcb);
812  }
813 
814  ExReleaseResourceLite(&Vcb->chunk_lock);
815  }
816 
817  // update parents
818  le2 = mr->refs.Flink;
819  while (le2 != &mr->refs) {
821 
822  if (ref->parent) {
823  uint16_t i;
824  internal_node* in = (internal_node*)&ref->parent->data[1];
825 
// Patch the pointer in the parent's on-disk image (first match only)...
826  for (i = 0; i < ref->parent->data->num_items; i++) {
827  if (in[i].address == mr->address) {
828  in[i].address = mr->new_address;
829  break;
830  }
831  }
832 
// ...and in the parent's in-memory tree, if it is loaded.
833  if (ref->parent->t) {
834  LIST_ENTRY* le3;
835 
836  le3 = ref->parent->t->itemlist.Flink;
837  while (le3 != &ref->parent->t->itemlist) {
839 
840  if (!td->inserted && td->treeholder.address == mr->address)
841  td->treeholder.address = mr->new_address;
842 
843  le3 = le3->Flink;
844  }
845  }
846  } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
847  LIST_ENTRY* le3;
848  root* r = NULL;
849 
850  // alter ROOT_ITEM
851 
852  le3 = Vcb->roots.Flink;
853  while (le3 != &Vcb->roots) {
855 
856  if (r2->id == ref->tbr.offset) {
857  r = r2;
858  break;
859  }
860 
861  le3 = le3->Flink;
862  }
863 
864  if (r) {
865  r->treeholder.address = mr->new_address;
866 
// The root tree and chunk tree are pointed to by the superblock; every other
// root is pointed to by a ROOT_ITEM, which is replaced with an updated copy.
867  if (r == Vcb->root_root)
868  Vcb->superblock.root_tree_addr = mr->new_address;
869  else if (r == Vcb->chunk_root)
870  Vcb->superblock.chunk_tree_addr = mr->new_address;
871  else if (r->root_item.block_number == mr->address) {
872  KEY searchkey;
873  ROOT_ITEM* ri;
874 
875  r->root_item.block_number = mr->new_address;
876 
877  searchkey.obj_id = r->id;
878  searchkey.obj_type = TYPE_ROOT_ITEM;
879  searchkey.offset = 0xffffffffffffffff;
880 
881  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
882  if (!NT_SUCCESS(Status)) {
883  ERR("find_item returned %08x\n", Status);
884  goto end;
885  }
886 
887  if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
888  ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
890  goto end;
891  }
892 
894  if (!ri) {
895  ERR("out of memory\n");
897  goto end;
898  }
899 
900  RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
901 
903  if (!NT_SUCCESS(Status)) {
904  ERR("delete_tree_item returned %08x\n", Status);
905  goto end;
906  }
907 
// NOTE(review): ri is not freed on the error paths visible here after it has been
// allocated -- confirm whether that is intentional.
908  Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
909  if (!NT_SUCCESS(Status)) {
910  ERR("insert_tree_item returned %08x\n", Status);
911  goto end;
912  }
913  }
914  }
915  }
916 
917  le2 = le2->Flink;
918  }
919 
// The node header carries its own address; update it before checksumming.
920  mr->data->address = mr->new_address;
921 
922  t3 = mr->t;
923 
// Re-home every loaded copy of this tree under its new address / hash.
924  while (t3) {
925  uint8_t h;
926  bool inserted;
927  tree* t4 = NULL;
928 
929  // check if tree loaded more than once
// NOTE(review): the guard tests list_entry.Flink, but the next line dereferences
// list_entry_hash.Flink -- confirm which list was intended.
930  if (t3->list_entry.Flink != &Vcb->trees_hash) {
931  tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
932 
933  if (nt->header.address == t3->header.address)
934  t4 = nt;
935  }
936 
937  t3->header.address = mr->new_address;
938 
939  h = t3->hash >> 24;
940 
// If this tree was the head of its hash bucket, advance the bucket pointer to the
// next tree in the same bucket, or clear it.
941  if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
942  if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
943  Vcb->trees_ptrs[h] = NULL;
944  else {
945  tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
946 
947  if (t2->hash >> 24 == h)
948  Vcb->trees_ptrs[h] = &t2->list_entry_hash;
949  else
950  Vcb->trees_ptrs[h] = NULL;
951  }
952  }
953 
955 
956  t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
957  h = t3->hash >> 24;
958 
// Pick where to start the ordered insert: the new bucket's head if it has one,
// otherwise the nearest lower non-empty bucket, otherwise the list head.
// NOTE(review): h2 is uint8_t. For h >= 2 the loop exits before bucket 0 is examined;
// for h == 1 with bucket 0 empty, h2-- wraps to 255 and the scan continues through
// the higher buckets -- confirm this is intended.
959  if (!Vcb->trees_ptrs[h]) {
960  uint8_t h2 = h;
961 
962  le2 = Vcb->trees_hash.Flink;
963 
964  if (h2 > 0) {
965  h2--;
966  do {
967  if (Vcb->trees_ptrs[h2]) {
968  le2 = Vcb->trees_ptrs[h2];
969  break;
970  }
971 
972  h2--;
973  } while (h2 > 0);
974  }
975  } else
976  le2 = Vcb->trees_ptrs[h];
977 
// NOTE(review): the insertion statement inside the if below is not visible.
978  inserted = false;
979  while (le2 != &Vcb->trees_hash) {
980  tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
981 
982  if (t2->hash >= t3->hash) {
984  inserted = true;
985  break;
986  }
987 
988  le2 = le2->Flink;
989  }
990 
991  if (!inserted)
992  InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
993 
// Make t3 the bucket head if the bucket was empty or t3 now precedes the old head.
994  if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
995  Vcb->trees_ptrs[h] = &t3->list_entry_hash;
996 
// Apply relocated data-extent addresses to the loaded leaf's items as well.
997  if (data_items && level == 0) {
998  le2 = data_items->Flink;
999 
1000  while (le2 != data_items) {
1002  LIST_ENTRY* le3 = t3->itemlist.Flink;
1003 
1004  while (le3 != &t3->itemlist) {
1006 
1007  if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1008  EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1009 
1012 
1013  if (ed2->address == dr->address)
1014  ed2->address = dr->new_address;
1015  }
1016  }
1017 
1018  le3 = le3->Flink;
1019  }
1020 
1021  le2 = le2->Flink;
1022  }
1023  }
1024 
1025  t3 = t4;
1026  }
1027 
// Recompute the node checksum: inverted CRC32C over everything from fs_uuid to the
// end of the node, stored in the first four bytes of the block.
1028  *((uint32_t*)mr->data) = ~calc_crc32c(0xffffffff, (uint8_t*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1029 
1031  if (!tw) {
1032  ERR("out of memory\n");
1034  goto end;
1035  }
1036 
// allocated = false: the buffer stays owned by mr, so the cleanup at `end` must not
// free tw->data.
1037  tw->address = mr->new_address;
1038  tw->length = Vcb->superblock.node_size;
1039  tw->data = (uint8_t*)mr->data;
1040  tw->allocated = false;
1041 
// Keep tree_writes sorted by ascending address.
1042  if (IsListEmpty(&tree_writes))
1043  InsertTailList(&tree_writes, &tw->list_entry);
1044  else {
1045  bool inserted = false;
1046 
1047  le2 = tree_writes.Flink;
1048  while (le2 != &tree_writes) {
1050 
1051  if (tw2->address > tw->address) {
1052  InsertHeadList(le2->Blink, &tw->list_entry);
1053  inserted = true;
1054  break;
1055  }
1056 
1057  le2 = le2->Flink;
1058  }
1059 
1060  if (!inserted)
1061  InsertTailList(&tree_writes, &tw->list_entry);
1062  }
1063  }
1064 
1065  le = le->Flink;
1066  }
1067  }
1068 
// Pass 4: write the relocated blocks, then create their extent-tree entries.
1069  Status = do_tree_writes(Vcb, &tree_writes, true);
1070  if (!NT_SUCCESS(Status)) {
1071  ERR("do_tree_writes returned %08x\n", Status);
1072  goto end;
1073  }
1074 
1075  le = items->Flink;
1076  while (le != items) {
1078 
1080  if (!NT_SUCCESS(Status)) {
1081  ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1082  goto end;
1083  }
1084 
1085  le = le->Flink;
1086  }
1087 
1089 
// Common exit: release the write queue (the statement that unlinks each entry is
// not visible in this listing).
1090 end:
1091  while (!IsListEmpty(&tree_writes)) {
1093 
1094  if (tw->allocated)
1095  ExFreePool(tw->data);
1096 
1097  ExFreePool(tw);
1098  }
1099 
1100  return Status;
1101 }
1102 
// Relocates one batch of tree blocks out of chunk c (the function's signature line is
// missing from this listing, so its declared name is not visible here).
//
// Under the exclusive tree lock it scans the extent tree for tree-block extent items
// whose address lies inside the chunk, creates a relocation record for each (at most
// 64 per call), relocates them, and flushes the result with do_write. *changed is set
// to false when no tree block was found in the chunk and true otherwise.
//
// Returns the final NTSTATUS. All relocation records are freed before returning.
// NOTE(review): several statements are missing from this listing (list initialisation,
// the call whose failure is reported as write_metadata_items, the statements run after
// do_write on success/failure); comments describe only what is visible.
1104  KEY searchkey;
1105  traverse_ptr tp;
1106  NTSTATUS Status;
1107  bool b;
1109  uint32_t loaded = 0;
1110 
1111  TRACE("chunk %I64x\n", c->offset);
1112 
1115 
1116  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1117 
// Start the scan at the chunk's first byte address.
1118  searchkey.obj_id = c->offset;
1119  searchkey.obj_type = TYPE_METADATA_ITEM;
1120  searchkey.offset = 0xffffffffffffffff;
1121 
1122  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1123  if (!NT_SUCCESS(Status)) {
1124  ERR("find_item returned %08x\n", Status);
1125  goto end;
1126  }
1127 
1128  do {
1129  traverse_ptr next_tp;
1130 
// Past the end of the chunk: nothing more to collect.
1131  if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1132  break;
1133 
1134  if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1135  bool tree = false, skinny = false;
1136 
// A METADATA_ITEM is always a tree block (skinny form). An EXTENT_ITEM is a candidate
// when its key offset equals the node size.
// NOTE(review): the condition guarding "tree = true" in the EXTENT_ITEM branch is not
// visible in this listing; presumably it tests the tree-block flag of ei -- confirm.
1137  if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1138  tree = true;
1139  skinny = true;
1140  } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1141  tp.item->size >= sizeof(EXTENT_ITEM)) {
1142  EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1143 
1145  tree = true;
1146  }
1147 
1148  if (tree) {
1149  Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1150 
1151  if (!NT_SUCCESS(Status)) {
1152  ERR("add_metadata_reloc returned %08x\n", Status);
1153  goto end;
1154  }
1155 
1156  loaded++;
1157 
1158  if (loaded >= 64) // only do 64 at a time
1159  break;
1160  }
1161  }
1162 
1163  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1164 
1165  if (b)
1166  tp = next_tp;
1167  } while (b);
1168 
// Nothing collected: report "no change" and skip straight to cleanup.
1169  if (IsListEmpty(&items)) {
1170  *changed = false;
1172  goto end;
1173  } else
1174  *changed = true;
1175 
1177  if (!NT_SUCCESS(Status)) {
1178  ERR("write_metadata_items returned %08x\n", Status);
1179  goto end;
1180  }
1181 
1183 
1184  Vcb->need_write = true;
1185 
// Cleanup runs on every path; the flush is attempted only if everything so far
// succeeded.
1186 end:
1187  if (NT_SUCCESS(Status)) {
1188  Status = do_write(Vcb, NULL);
1189  if (!NT_SUCCESS(Status))
1190  ERR("do_write returned %08x\n", Status);
1191  }
1192 
// NOTE(review): both branch bodies are missing from this listing; presumably they
// discard or apply the rollback list -- confirm.
1193  if (NT_SUCCESS(Status))
1195  else
1197 
1198  free_trees(Vcb);
1199 
1200  ExReleaseResourceLite(&Vcb->tree_lock);
1201 
// Free every relocation record together with its refs and its node buffer (the
// statements that unlink entries from the lists are not visible in this listing).
1202  while (!IsListEmpty(&items)) {
1204 
1205  while (!IsListEmpty(&mr->refs)) {
1207 
1208  ExFreePool(ref);
1209  }
1210 
1211  if (mr->data)
1212  ExFreePool(mr->data);
1213 
1214  ExFreePool(mr);
1215  }
1216 
1217  return Status;
1218 }
1219 
// data_reloc_add_tree_edr (name taken from the call sites below; the signature lines
// are missing from this listing). Call sites pass
// (Vcb, metadata_items, dr, edr, rollback).
//
// Resolves an EXTENT_DATA_REF on a data extent into the leaf blocks that actually
// hold the referencing EXTENT_DATA items: it finds the root named by edr->root, walks
// that root's EXTENT_DATA items for inode edr->objid, and for every item that matches
// dr's address and size and edr's offset, records a ref on dr whose parent is the
// metadata_reloc of the containing leaf. Consecutive matches in the same leaf are
// merged into one ref by bumping its count.
//
// Returns STATUS_SUCCESS, a failing status from find_item / add_metadata_reloc_parent,
// or STATUS_INTERNAL_ERROR when the root or its EXTENT_DATA items cannot be found.
// NOTE(review): some statements (declarations, the ref allocation) are missing from
// this listing; comments describe only what is visible.
1222  NTSTATUS Status;
1223  LIST_ENTRY* le;
1224  KEY searchkey;
1225  traverse_ptr tp;
1226  root* r = NULL;
1227  metadata_reloc* mr;
1228  uint64_t last_tree = 0;
1230 
// Locate the root (subvolume) the reference belongs to.
1231  le = Vcb->roots.Flink;
1232  while (le != &Vcb->roots) {
1234 
1235  if (r2->id == edr->root) {
1236  r = r2;
1237  break;
1238  }
1239 
1240  le = le->Flink;
1241  }
1242 
// NOTE(review): the lookup above compares against edr->root, but the message below
// prints edr->count -- the logged value looks wrong; confirm and fix in the source.
1243  if (!r) {
1244  ERR("could not find subvol %I64x\n", edr->count);
1245  return STATUS_INTERNAL_ERROR;
1246  }
1247 
1248  searchkey.obj_id = edr->objid;
1249  searchkey.obj_type = TYPE_EXTENT_DATA;
1250  searchkey.offset = 0;
1251 
1252  Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1253  if (!NT_SUCCESS(Status)) {
1254  ERR("find_item returned %08x\n", Status);
1255  return Status;
1256  }
1257 
// find_item may land on the item just before the wanted key; step forward once.
1258  if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1259  traverse_ptr tp2;
1260 
1261  if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1262  tp = tp2;
1263  else {
1264  ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1265  return STATUS_INTERNAL_ERROR;
1266  }
1267  }
1268 
1269  ref = NULL;
1270 
// Walk all EXTENT_DATA items of this inode.
1271  while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1272  traverse_ptr tp2;
1273 
1274  if (tp.item->size >= sizeof(EXTENT_DATA)) {
1276 
1279 
// A match needs the same extent address and size, and the file offset minus the
// offset into the extent must equal the offset stored in the reference.
1280  if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
// Same leaf as the previous match: fold into the existing ref.
1281  if (ref && last_tree == tp.tree->header.address)
1282  ref->edr.count++;
1283  else {
1285  if (!ref) {
1286  ERR("out of memory\n");
1288  }
1289 
// Start from a copy of the original reference, counting from one.
1290  ref->type = TYPE_EXTENT_DATA_REF;
1291  RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1292  ref->edr.count = 1;
1293 
1294  Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1295  if (!NT_SUCCESS(Status)) {
1296  ERR("add_metadata_reloc_parent returned %08x\n", Status);
1297  ExFreePool(ref);
1298  return Status;
1299  }
1300 
1301  last_tree = tp.tree->header.address;
1302  ref->parent = mr;
1303 
1304  InsertTailList(&dr->refs, &ref->list_entry);
1305  }
1306  }
1307  }
1308  }
1309 
1310  if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1311  tp = tp2;
1312  else
1313  break;
1314  }
1315 
1316  return STATUS_SUCCESS;
1317 }
1318 
/* Body of the routine that the chunk-balancing code below invokes as
 * add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback) -- presumably; the
 * signature (listing lines 1319-1320) and several other lines are absent from this extract.
 *
 * Builds a data_reloc (dr) for the extent item at *tp, recording its address (key.obj_id),
 * size (key.offset) and a pointer to its EXTENT_ITEM, then collects every reference to it:
 *   - inline refs stored after the EXTENT_ITEM header, each a one-byte type followed by an
 *     EXTENT_DATA_REF or SHARED_DATA_REF;
 *   - if the inline counts add up to less than ei->refcount, the following tree items with
 *     the same obj_id, each of which is deleted after being recorded.
 *
 * Returns STATUS_SUCCESS, STATUS_INTERNAL_ERROR for malformed inline data, or the failing
 * status of a helper.
 *
 * NOTE(review): none of the visible error returns release dr (or refs already queued on
 * dr->refs); confirm whether these paths leak. */
1321  NTSTATUS Status;
1322  data_reloc* dr;
1323  EXTENT_ITEM* ei;
1324  uint16_t len;
1325  uint64_t inline_rc;
1326  uint8_t* ptr;
1327 
1329  if (!dr) {
1330  ERR("out of memory\n");
1332  }
1333 
1334  dr->address = tp->item->key.obj_id;
1335  dr->size = tp->item->key.offset;
1336  dr->ei = (EXTENT_ITEM*)tp->item->data;
1337  InitializeListHead(&dr->refs);
1338 
1340  if (!NT_SUCCESS(Status)) {
1341  ERR("delete_tree_item returned %08x\n", Status);
// NOTE(review): no ExFreePool(dr) before this return -- confirm.
1342  return Status;
1343  }
1344 
1345  if (!c)
1347 
1348  if (c) {
1350 
// the extent no longer counts towards the space used in its chunk
1351  c->used -= tp->item->key.offset;
1352 
1354 
1356  }
1357 
1358  ei = (EXTENT_ITEM*)tp->item->data;
1359  inline_rc = 0;
1360 
// NOTE(review): len is uint16_t and no check that item->size >= sizeof(EXTENT_ITEM) is
// visible in this body; a shorter item would make this subtraction wrap -- confirm the
// caller guarantees the size.
1361  len = tp->item->size - sizeof(EXTENT_ITEM);
1362  ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1363 
// walk the inline references: one type byte, then the type-specific payload
1364  while (len > 0) {
1365  uint8_t secttype = *ptr;
1366  uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1367 
// account for the type byte just read
1368  len--;
1369 
1370  if (sectlen > len) {
1371  ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1372  return STATUS_INTERNAL_ERROR;
1373  }
1374 
// sectlen is 0 for any type other than the two handled below
1375  if (sectlen == 0) {
1376  ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1377  return STATUS_INTERNAL_ERROR;
1378  }
1379 
1380  if (secttype == TYPE_EXTENT_DATA_REF) {
1381  EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1382 
1383  inline_rc += edr->count;
1384 
1385  Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1386  if (!NT_SUCCESS(Status)) {
1387  ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1388  return Status;
1389  }
1390  } else if (secttype == TYPE_SHARED_DATA_REF) {
1391  metadata_reloc* mr;
1393 
1395  if (!ref) {
1396  ERR("out of memory\n");
1398  }
1399 
1400  ref->type = TYPE_SHARED_DATA_REF;
1401  RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1402  inline_rc += ref->sdr.count;
1403 
// sdr.offset is passed as the address of the metadata item to look up as this ref's parent
1404  Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1405  if (!NT_SUCCESS(Status)) {
1406  ERR("add_metadata_reloc_parent returned %08x\n", Status);
1407  ExFreePool(ref);
1408  return Status;
1409  }
1410 
1411  ref->parent = mr;
1412 
1413  InsertTailList(&dr->refs, &ref->list_entry);
1414  } else {
// not reachable given the sectlen == 0 check above
1415  ERR("unexpected tree type %x\n", secttype);
1416  return STATUS_INTERNAL_ERROR;
1417  }
1418 
1419 
1420  len -= sectlen;
1421  ptr += sizeof(uint8_t) + sectlen;
1422  }
1423 
1424  if (inline_rc < ei->refcount) { // look for non-inline entries
1425  traverse_ptr tp2 = *tp, next_tp;
1426 
// scan forward while the items still belong to this extent (same obj_id)
1427  while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1428  tp2 = next_tp;
1429 
1430  if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1431  if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1432  Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1433  if (!NT_SUCCESS(Status)) {
1434  ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1435  return Status;
1436  }
1437 
1438  Status = delete_tree_item(Vcb, &tp2);
1439  if (!NT_SUCCESS(Status)) {
1440  ERR("delete_tree_item returned %08x\n", Status);
1441  return Status;
1442  }
1443  } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1444  metadata_reloc* mr;
1446 
1448  if (!ref) {
1449  ERR("out of memory\n");
1451  }
1452 
// for a standalone shared ref the parent address is the key offset and the
// item data is just the 32-bit count
1453  ref->type = TYPE_SHARED_DATA_REF;
1454  ref->sdr.offset = tp2.item->key.offset;
1455  ref->sdr.count = *((uint32_t*)tp2.item->data);
1456 
1457  Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1458  if (!NT_SUCCESS(Status)) {
1459  ERR("add_metadata_reloc_parent returned %08x\n", Status);
1460  ExFreePool(ref);
1461  return Status;
1462  }
1463 
1464  ref->parent = mr;
1465  InsertTailList(&dr->refs, &ref->list_entry);
1466 
1467  Status = delete_tree_item(Vcb, &tp2);
1468  if (!NT_SUCCESS(Status)) {
1469  ERR("delete_tree_item returned %08x\n", Status);
1470  return Status;
1471  }
1472  }
1473  } else
1474  break;
1475  }
1476  }
1477 
1479 
1480  return STATUS_SUCCESS;
1481 }
1482 
/* Body of a void helper that reorders dr->refs (its signature, listing line 1483, and the
 * lines that fetch/insert list entries are absent from this extract).
 *
 * Visible behaviour:
 *   1. each ref gets a hash: for EXTENT_DATA_REF from get_extent_data_ref_hash2(root, objid,
 *      offset), for SHARED_DATA_REF the parent's new_address;
 *   2. refs are moved into a temporary list by insertion sort, comparing type first and then
 *      hash (see the comparison below for the direction);
 *   3. neighbouring EXTENT_DATA_REF entries with identical root/objid/offset are merged by
 *      summing their counts and freeing the second entry;
 *   4. the temporary list is spliced back in as dr->refs.
 * An empty dr->refs returns immediately. */
1484  LIST_ENTRY newlist, *le;
1485 
1486  if (IsListEmpty(&dr->refs))
1487  return;
1488 
1489  // insertion sort
1490 
1491  InitializeListHead(&newlist);
1492 
1493  while (!IsListEmpty(&dr->refs)) {
// (line obtaining `ref` from dr->refs is missing from the listing)
1495  bool inserted = false;
1496 
1497  if (ref->type == TYPE_EXTENT_DATA_REF)
1498  ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1499  else if (ref->type == TYPE_SHARED_DATA_REF)
1500  ref->hash = ref->parent->new_address;
1501 
1502  le = newlist.Flink;
1503  while (le != &newlist) {
1505 
// stop at the first existing entry with a greater type, or same type and smaller hash;
// the insertion statement itself (line 1507) is missing from the listing
1506  if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1508  inserted = true;
1509  break;
1510  }
1511 
1512  le = le->Flink;
1513  }
1514 
1515  if (!inserted)
1516  InsertTailList(&newlist, &ref->list_entry);
1517  }
1518 
// merge pass: `le` is only advanced when no merge happened, so one entry can absorb
// several consecutive duplicates
1519  le = newlist.Flink;
1520  while (le != &newlist) {
1522 
1523  if (le->Flink != &newlist) {
1525 
1526  if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1527  ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1528  RemoveEntryList(&ref2->list_entry);
1529  ref->edr.count += ref2->edr.count;
1530  ExFreePool(ref2);
1531  continue;
1532  }
1533  }
1534 
1535  le = le->Flink;
1536  }
1537 
// re-home the sorted chain under dr->refs by patching the four boundary links
1538  newlist.Flink->Blink = &dr->refs;
1539  newlist.Blink->Flink = &dr->refs;
1540  dr->refs.Flink = newlist.Flink;
1541  dr->refs.Blink = newlist.Blink;
1542 }
1543 
/* Body of the routine logged by its caller as "add_data_reloc_extent_item" (signature,
 * listing line 1544, is absent from this extract).
 *
 * Writes the extent-tree entries for a relocated data extent at dr->new_address:
 *   - totals the reference count over dr->refs and decides how many refs fit inline, the
 *     limit being a quarter of the node size (node_size >> 2);
 *   - builds an EXTENT_ITEM followed by the inline refs and inserts it with key
 *     (dr->new_address, TYPE_EXTENT_ITEM, dr->size);
 *   - inserts every remaining ref as its own TYPE_EXTENT_DATA_REF / TYPE_SHARED_DATA_REF item.
 * Shared refs are rewritten to point at the parent's new_address.
 *
 * Returns STATUS_SUCCESS or the failing status from insert_tree_item.
 *
 * NOTE(review): on insert_tree_item failure the buffers ei / edr / sdr are not freed in the
 * visible code; confirm whether insert_tree_item takes ownership on failure. */
1545  NTSTATUS Status;
1546  LIST_ENTRY* le;
1547  uint64_t rc = 0;
1548  uint16_t inline_len;
1549  bool all_inline = true;
1550  data_reloc_ref* first_noninline = NULL;
1551  EXTENT_ITEM* ei;
1552  uint8_t* ptr;
1553 
1554  inline_len = sizeof(EXTENT_ITEM);
1555 
1557 
// first pass: count references and find the first ref that no longer fits inline
1558  le = dr->refs.Flink;
1559  while (le != &dr->refs) {
1561  uint16_t extlen = 0;
1562 
1563  if (ref->type == TYPE_EXTENT_DATA_REF) {
1564  extlen += sizeof(EXTENT_DATA_REF);
1565  rc += ref->edr.count;
1566  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1567  extlen += sizeof(SHARED_DATA_REF);
// note: adds 1 per shared ref rather than ref->sdr.count
1568  rc++;
1569  }
1570 
1571  if (all_inline) {
// the extra 1 is the type byte that precedes each inline ref
1572  if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1573  all_inline = false;
1574  first_noninline = ref;
1575  } else
1576  inline_len += extlen + 1;
1577  }
1578 
1579  le = le->Flink;
1580  }
1581 
1582  ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1583  if (!ei) {
1584  ERR("out of memory\n");
1586  }
1587 
// generation and flags are carried over from the original extent item
1588  ei->refcount = rc;
1589  ei->generation = dr->ei->generation;
1590  ei->flags = dr->ei->flags;
1591  ptr = (uint8_t*)&ei[1];
1592 
// second pass: serialise the inline refs directly after the EXTENT_ITEM header
1593  le = dr->refs.Flink;
1594  while (le != &dr->refs) {
1596 
1597  if (ref == first_noninline)
1598  break;
1599 
1600  *ptr = ref->type;
1601  ptr++;
1602 
1603  if (ref->type == TYPE_EXTENT_DATA_REF) {
1605 
1606  RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1607 
1608  ptr += sizeof(EXTENT_DATA_REF);
1609  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1611 
1612  sdr->offset = ref->parent->new_address;
1613  sdr->count = ref->sdr.count;
1614 
1615  ptr += sizeof(SHARED_DATA_REF);
1616  }
1617 
1618  le = le->Flink;
1619  }
1620 
1621  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1622  if (!NT_SUCCESS(Status)) {
1623  ERR("insert_tree_item returned %08x\n", Status);
1624  return Status;
1625  }
1626 
// everything from first_noninline onwards becomes a standalone item
1627  if (!all_inline) {
1628  le = &first_noninline->list_entry;
1629 
1630  while (le != &dr->refs) {
1632 
1633  if (ref->type == TYPE_EXTENT_DATA_REF) {
1634  EXTENT_DATA_REF* edr;
1635 
1637  if (!edr) {
1638  ERR("out of memory\n");
1640  }
1641 
1642  RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1643 
// key offset for a standalone data ref is the ref's hash
1644  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1645  if (!NT_SUCCESS(Status)) {
1646  ERR("insert_tree_item returned %08x\n", Status);
1647  return Status;
1648  }
1649  } else if (ref->type == TYPE_SHARED_DATA_REF) {
1650  uint32_t* sdr;
1651 
1653  if (!sdr) {
1654  ERR("out of memory\n");
1656  }
1657 
1658  *sdr = ref->sdr.count;
1659 
// key offset for a standalone shared ref is the parent's new address; payload is the count
1660  Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1661  if (!NT_SUCCESS(Status)) {
1662  ERR("insert_tree_item returned %08x\n", Status);
1663  return Status;
1664  }
1665  }
1666 
1667  le = le->Flink;
1668  }
1669  }
1670 
1671  return STATUS_SUCCESS;
1672 }
1673 
/* Body of the routine that relocates data extents out of chunk `c` (signature, listing line
 * 1674, and a number of declaration lines are absent from this extract; it reads Vcb and c
 * and writes *changed).
 *
 * Outline of the visible code, all under Vcb->tree_lock held exclusively:
 *   1. scan the extent tree over the chunk's address range and queue a data_reloc for each
 *      extent item not marked as a tree (the flag test on line 1714 is missing), stopping
 *      once 0x1000000 bytes or 100 extents have been queued;
 *   2. if nothing was queued, set *changed = false and finish; otherwise *changed = true;
 *   3. for each queued extent pick a destination address (last destination chunk first, then
 *      any suitable existing chunk, then a freshly allocated chunk), gather its checksums,
 *      and copy the data across in pieces of at most BALANCE_UNIT bytes;
 *   4. write the affected metadata, insert the new extent items, and retarget matching
 *      entries in c->changed_extents;
 *   5. at `end`: on success patch extent addresses held by chunk cache inodes, flush with
 *      do_write, then patch the extents of all open FCBs; finally drop the trees, release
 *      the lock and free the work lists.
 *
 * Returns the last NTSTATUS obtained. */
1675  KEY searchkey;
1676  traverse_ptr tp;
1677  NTSTATUS Status;
1678  bool b;
1679  LIST_ENTRY items, metadata_items, rollback, *le;
1680  uint64_t loaded = 0, num_loaded = 0;
1681  chunk* newchunk = NULL;
1682  uint8_t* data = NULL;
1683 
1684  TRACE("chunk %I64x\n", c->offset);
1685 
1688  InitializeListHead(&metadata_items);
1689 
1690  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1691 
// search key: (chunk offset, EXTENT_ITEM, max offset)
1692  searchkey.obj_id = c->offset;
1693  searchkey.obj_type = TYPE_EXTENT_ITEM;
1694  searchkey.offset = 0xffffffffffffffff;
1695 
1696  Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1697  if (!NT_SUCCESS(Status)) {
1698  ERR("find_item returned %08x\n", Status);
1699  goto end;
1700  }
1701 
1702  do {
1703  traverse_ptr next_tp;
1704 
// past the end of this chunk: nothing more to collect
1705  if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1706  break;
1707 
1708  if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1709  bool tree = false;
1710 
1711  if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1712  EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1713 
1715  tree = true;
1716  }
1717 
1718  if (!tree) {
1719  Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1720 
1721  if (!NT_SUCCESS(Status)) {
1722  ERR("add_data_reloc returned %08x\n", Status);
1723  goto end;
1724  }
1725 
// key.offset of an extent item is used as its length in bytes
1726  loaded += tp.item->key.offset;
1727  num_loaded++;
1728 
1729  if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1730  break;
1731  }
1732  }
1733 
1734  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1735 
1736  if (b)
1737  tp = next_tp;
1738  } while (b);
1739 
1740  if (IsListEmpty(&items)) {
1741  *changed = false;
1743  goto end;
1744  } else
1745  *changed = true;
1746 
// `data` is the bounce buffer used for every read/write pair below (its allocation,
// line 1747, is missing from the listing)
1748  if (!data) {
1749  ERR("out of memory\n");
1751  goto end;
1752  }
1753 
1754  le = items.Flink;
1755  while (le != &items) {
1757  bool done = false;
1758  LIST_ENTRY* le2;
1759  uint32_t* csum;
1760  RTL_BITMAP bmp;
1761  ULONG* bmparr;
1762  ULONG bmplen, runlength, index, lastoff;
1763 
// first choice: the chunk the previous extent was placed in
1764  if (newchunk) {
1765  acquire_chunk_lock(newchunk, Vcb);
1766 
1767  if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1768  newchunk->used += dr->size;
1769  space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1770  done = true;
1771  }
1772 
1773  release_chunk_lock(newchunk, Vcb);
1774  }
1775 
1776  if (!done) {
1777  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1778 
// second choice: any other chunk that is writable, not being relocated, has exactly the
// volume's data flags and has enough free bytes
1779  le2 = Vcb->chunks.Flink;
1780  while (le2 != &Vcb->chunks) {
1781  chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1782 
1783  if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1784  acquire_chunk_lock(c2, Vcb);
1785 
1786  if ((c2->chunk_item->size - c2->used) >= dr->size) {
1787  if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1788  c2->used += dr->size;
1789  space_list_subtract(c2, false, dr->new_address, dr->size, &rollback);
1790  release_chunk_lock(c2, Vcb);
1791  newchunk = c2;
1792  done = true;
1793  break;
1794  }
1795  }
1796 
1797  release_chunk_lock(c2, Vcb);
1798  }
1799 
1800  le2 = le2->Flink;
1801  }
1802 
1803  // allocate new chunk if necessary
1804  if (!done) {
1805  Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1806 
1807  if (!NT_SUCCESS(Status)) {
1808  ERR("alloc_chunk returned %08x\n", Status);
1809  ExReleaseResourceLite(&Vcb->chunk_lock);
1810  goto end;
1811  }
1812 
1813  acquire_chunk_lock(newchunk, Vcb);
1814 
// tag the new chunk with the current balance number
1815  newchunk->balance_num = Vcb->balance.balance_num;
1816 
1817  if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1818  release_chunk_lock(newchunk, Vcb);
1819  ExReleaseResourceLite(&Vcb->chunk_lock);
1820  ERR("could not find address in new chunk\n");
1822  goto end;
1823  } else {
1824  newchunk->used += dr->size;
1825  space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1826  }
1827 
1828  release_chunk_lock(newchunk, Vcb);
1829  }
1830 
1831  ExReleaseResourceLite(&Vcb->chunk_lock);
1832  }
1833 
1834  dr->newchunk = newchunk;
1835 
// one bitmap bit and one 32-bit checksum slot per sector of the extent
1836  bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size);
1837 
1838  bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1839  if (!bmparr) {
1840  ERR("out of memory\n");
1842  goto end;
1843  }
1844 
1845  csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(uint32_t) / Vcb->superblock.sector_size), ALLOC_TAG);
1846  if (!csum) {
1847  ERR("out of memory\n");
1848  ExFreePool(bmparr);
1850  goto end;
1851  }
1852 
1853  RtlInitializeBitMap(&bmp, bmparr, bmplen);
1854  RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1855 
1856  searchkey.obj_id = EXTENT_CSUM_ID;
1857  searchkey.obj_type = TYPE_EXTENT_CSUM;
1858  searchkey.offset = dr->address;
1859 
1860  Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1861  if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1862  ERR("find_item returned %08x\n", Status);
1863  ExFreePool(csum);
1864  ExFreePool(bmparr);
1865  goto end;
1866  }
1867 
// copy every stored checksum that overlaps [dr->address, dr->address + dr->size) into
// csum[] and clear the matching bitmap bits
1868  if (Status != STATUS_NOT_FOUND) {
1869  do {
1870  traverse_ptr next_tp;
1871 
1872  if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1873  if (tp.item->key.offset >= dr->address + dr->size)
1874  break;
1875  else if (tp.item->size >= sizeof(uint32_t) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= dr->address) {
// cs / ce: start and end of the overlap between this csum item and the extent
1876  uint64_t cs = max(dr->address, tp.item->key.offset);
1877  uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)));
1878 
1879  RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1880  tp.item->data + ((cs - tp.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size),
1881  (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size));
1882 
1883  RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1884 
1885  if (ce == dr->address + dr->size)
1886  break;
1887  }
1888  }
1889 
1890  if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1891  tp = next_tp;
1892  else
1893  break;
1894  } while (true);
1895  }
1896 
// walk the clear (checksummed) runs; lastoff tracks the sector up to which data has
// already been copied
1897  lastoff = 0;
1898  runlength = RtlFindFirstRunClear(&bmp, &index);
1899 
1900  while (runlength != 0) {
1901  if (index >= bmplen)
1902  break;
1903 
// clamp a run that extends to or past the end of the bitmap
1904  if (index + runlength >= bmplen) {
1905  runlength = bmplen - index;
1906 
1907  if (runlength == 0)
1908  break;
1909  }
1910 
1911  if (index > lastoff) {
1912  ULONG off = lastoff;
1913  ULONG size = index - lastoff;
1914 
1915  // handle no csum run
1916  do {
1917  ULONG rl;
1918 
1919  if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1920  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1921  else
1922  rl = size;
1923 
1924  Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1925  c, NULL, NULL, 0, false, NormalPagePriority);
1926  if (!NT_SUCCESS(Status)) {
1927  ERR("read_data returned %08x\n", Status);
1928  ExFreePool(csum);
1929  ExFreePool(bmparr);
1930  goto end;
1931  }
1932 
1933  Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1934  NULL, newchunk, false, 0, NormalPagePriority);
1935  if (!NT_SUCCESS(Status)) {
1936  ERR("write_data_complete returned %08x\n", Status);
1937  ExFreePool(csum);
1938  ExFreePool(bmparr);
1939  goto end;
1940  }
1941 
1942  size -= rl;
1943  off += rl;
1944  } while (size > 0);
1945  }
1946 
// first call passes the checksums for the new location; second call passes NULL for the
// old location (presumably removing the old entries -- confirm against add_checksum_entry)
1947  add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1948  add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1949 
1950  // handle csum run
1951  do {
1952  ULONG rl;
1953 
1954  if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1955  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1956  else
1957  rl = runlength;
1958 
1959  Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], false, data,
1960  c, NULL, NULL, 0, false, NormalPagePriority);
1961  if (!NT_SUCCESS(Status)) {
1962  ERR("read_data returned %08x\n", Status);
1963  ExFreePool(csum);
1964  ExFreePool(bmparr);
1965  goto end;
1966  }
1967 
1968  Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1969  NULL, newchunk, false, 0, NormalPagePriority);
1970  if (!NT_SUCCESS(Status)) {
1971  ERR("write_data_complete returned %08x\n", Status);
1972  ExFreePool(csum);
1973  ExFreePool(bmparr);
1974  goto end;
1975  }
1976 
1977  runlength -= rl;
1978  index += rl;
1979  } while (runlength > 0);
1980 
// index now sits just past the run that was copied
1981  lastoff = index;
1982  runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1983  }
1984 
1985  ExFreePool(csum);
1986  ExFreePool(bmparr);
1987 
1988  // handle final nocsum run
1989  if (lastoff < dr->size / Vcb->superblock.sector_size) {
1990  ULONG off = lastoff;
1991  ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1992 
1993  do {
1994  ULONG rl;
1995 
1996  if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1997  rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1998  else
1999  rl = size;
2000 
2001  Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
2002  c, NULL, NULL, 0, false, NormalPagePriority);
2003  if (!NT_SUCCESS(Status)) {
2004  ERR("read_data returned %08x\n", Status);
2005  goto end;
2006  }
2007 
2008  Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
2009  NULL, newchunk, false, 0, NormalPagePriority);
2010  if (!NT_SUCCESS(Status)) {
2011  ERR("write_data_complete returned %08x\n", Status);
2012  goto end;
2013  }
2014 
2015  size -= rl;
2016  off += rl;
2017  } while (size > 0);
2018  }
2019 
2020  le = le->Flink;
2021  }
2022 
// the bounce buffer is no longer needed; NULL it so the cleanup below does not free twice
2023  ExFreePool(data);
2024  data = NULL;
2025 
2026  Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2027  if (!NT_SUCCESS(Status)) {
2028  ERR("write_metadata_items returned %08x\n", Status);
2029  goto end;
2030  }
2031 
2032  le = items.Flink;
2033  while (le != &items) {
2035 
2037  if (!NT_SUCCESS(Status)) {
2038  ERR("add_data_reloc_extent_item returned %08x\n", Status);
2039  goto end;
2040  }
2041 
2042  le = le->Flink;
2043  }
2044 
// retarget pending changed-extent records of the old chunk that refer to a moved extent;
// le3 saves the successor before the entry is touched (lines 2058-2059 are missing)
2045  le = c->changed_extents.Flink;
2046  while (le != &c->changed_extents) {
2047  LIST_ENTRY *le2, *le3;
2049 
2050  le3 = le->Flink;
2051 
2052  le2 = items.Flink;
2053  while (le2 != &items) {
2055 
2056  if (ce->address == dr->address) {
2057  ce->address = dr->new_address;
2060  break;
2061  }
2062 
2063  le2 = le2->Flink;
2064  }
2065 
2066  le = le3;
2067  }
2068 
2070 
2071  Vcb->need_write = true;
2072 
2073 end:
2074  if (NT_SUCCESS(Status)) {
2075  // update extents in cache inodes before we flush
2076  le = Vcb->chunks.Flink;
2077  while (le != &Vcb->chunks) {
2079 
2080  if (c2->cache) {
2081  LIST_ENTRY* le2;
2082 
2083  ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2084 
2085  le2 = c2->cache->extents.Flink;
2086  while (le2 != &c2->cache->extents) {
2088 
2089  if (!ext->ignore) {
2090  if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2091  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2092 
// only extents whose address lies inside the chunk being emptied can have moved
2093  if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2094  LIST_ENTRY* le3 = items.Flink;
2095  while (le3 != &items) {
2097 
2098  if (ed2->address == dr->address) {
2099  ed2->address = dr->new_address;
2100  break;
2101  }
2102 
2103  le3 = le3->Flink;
2104  }
2105  }
2106  }
2107  }
2108 
2109  le2 = le2->Flink;
2110  }
2111 
2112  ExReleaseResourceLite(c2->cache->Header.Resource);
2113  }
2114 
2115  le = le->Flink;
2116  }
2117 
2118  Status = do_write(Vcb, NULL);
2119  if (!NT_SUCCESS(Status))
2120  ERR("do_write returned %08x\n", Status);
2121  }
2122 
// Status is re-tested because do_write above may have failed
2123  if (NT_SUCCESS(Status)) {
2125 
2126  // update open FCBs
2127  // FIXME - speed this up(?)
2128 
2129  le = Vcb->all_fcbs.Flink;
2130  while (le != &Vcb->all_fcbs) {
2131  struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2132  LIST_ENTRY* le2;
2133 
2134  ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2135 
2136  le2 = fcb->extents.Flink;
2137  while (le2 != &fcb->extents) {
2139 
2140  if (!ext->ignore) {
2141  if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2142  EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2143 
2144  if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2145  LIST_ENTRY* le3 = items.Flink;
2146  while (le3 != &items) {
2148 
2149  if (ed2->address == dr->address) {
2150  ed2->address = dr->new_address;
2151  break;
2152  }
2153 
2154  le3 = le3->Flink;
2155  }
2156  }
2157  }
2158  }
2159 
2160  le2 = le2->Flink;
2161  }
2162 
2163  ExReleaseResourceLite(fcb->Header.Resource);
2164 
2165  le = le->Flink;
2166  }
2167  } else
2169 
2170  free_trees(Vcb);
2171 
2172  ExReleaseResourceLite(&Vcb->tree_lock);
2173 
2174  if (data)
2175  ExFreePool(data);
2176 
// release the per-extent and per-metadata work items together with their reference lists
// (the lines that unlink each entry are missing from the listing)
2177  while (!IsListEmpty(&items)) {
2179 
2180  while (!IsListEmpty(&dr->refs)) {
2182 
2183  ExFreePool(ref);
2184  }
2185 
2186  ExFreePool(dr);
2187  }
2188 
2189  while (!IsListEmpty(&metadata_items)) {
2191 
2192  while (!IsListEmpty(&mr->refs)) {
2194 
2195  ExFreePool(ref);
2196  }
2197 
2198  if (mr->data)
2199  ExFreePool(mr->data);
2200 
2201  ExFreePool(mr);
2202  }
2203 
2204  return Status;
2205 }
2206 
2207 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2208  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2209  return BLOCK_FLAG_RAID0;
2210  else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2211  return BLOCK_FLAG_RAID1;
2212  else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2213  return BLOCK_FLAG_DUPLICATE;
2214  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2215  return BLOCK_FLAG_RAID10;
2216  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2217  return BLOCK_FLAG_RAID5;
2218  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2219  return BLOCK_FLAG_RAID6;
2220  else
2221  return BLOCK_FLAG_SINGLE;
2222 }
2223 
/* Body of a boolean predicate deciding whether chunk c passes the balance filters stored in
 * Vcb->balance.opts[sort] (signature, listing line 2224, and the lines declaring `type` are
 * absent from this extract).
 *
 * Returns false as soon as any enabled filter rejects the chunk, true otherwise. Filters are
 * applied only when their BTRFS_BALANCE_OPTS_* bit is set in opts->flags:
 *   ENABLED  - must be set at all
 *   PROFILES - `type` must intersect opts->profiles
 *   DEVID    - some stripe must live on opts->devid
 *   DRANGE   - some stripe's physical range must overlap [drange_start, drange_end)
 *   VRANGE   - the chunk's logical range must overlap the vrange
 *   STRIPES  - see note below
 *   USAGE    - percentage used must lie within [usage_start, usage_end]
 * A final check (its condition, lines 2307-2308, is missing) rejects chunks whose `type`
 * already equals opts->convert. */
2225  btrfs_balance_opts* opts;
2226 
2227  opts = &Vcb->balance.opts[sort];
2228 
2229  if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2230  return false;
2231 
2232  if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2234 
2235  if (!(type & opts->profiles))
2236  return false;
2237  }
2238 
2239  if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2240  uint16_t i;
// the stripe array is laid out immediately after the CHUNK_ITEM header
2241  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2242  bool b = false;
2243 
2244  for (i = 0; i < c->chunk_item->num_stripes; i++) {
2245  if (cis[i].dev_id == opts->devid) {
2246  b = true;
2247  break;
2248  }
2249  }
2250 
2251  if (!b)
2252  return false;
2253  }
2254 
2255  if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2256  uint16_t i, factor;
2257  uint64_t physsize;
2258  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2259  bool b = false;
2260 
// factor = how many stripes' worth of distinct data the chunk size is spread over, so that
// size / factor is the length of each stripe on its device
// NOTE(review): factor can evaluate to 0 (e.g. RAID5 with one stripe, or sub_stripes of 0 or
// greater than num_stripes), which would divide by zero below -- confirm such chunk items are
// rejected earlier.
2261  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2262  factor = c->chunk_item->num_stripes;
2263  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2264  factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2265  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2266  factor = c->chunk_item->num_stripes - 1;
2267  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2268  factor = c->chunk_item->num_stripes - 2;
2269  else // SINGLE, DUPLICATE, RAID1
2270  factor = 1;
2271 
2272  physsize = c->chunk_item->size / factor;
2273 
// when a devid filter is also active, only stripes on that device are considered
2274  for (i = 0; i < c->chunk_item->num_stripes; i++) {
2275  if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2276  (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2277  b = true;
2278  break;
2279  }
2280  }
2281 
2282  if (!b)
2283  return false;
2284  }
2285 
2286  if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2287  if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2288  return false;
2289  }
2290 
// NOTE(review): both comparisons use `<`, so a chunk with more stripes than stripes_end is
// accepted and one between the bounds is rejected; a range filter would presumably need
// `num_stripes > opts->stripes_end` for the second test -- confirm.
2291  if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2292  if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2293  return false;
2294  }
2295 
2296  if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
// integer percentage, rounded down
2297  uint64_t usage = c->used * 100 / c->chunk_item->size;
2298 
2299  // usage == 0 should mean completely empty, not just that usage rounds to 0%
2300  if (c->used > 0 && usage == 0)
2301  usage = 1;
2302 
2303  if (usage < opts->usage_start || usage > opts->usage_end)
2304  return false;
2305  }
2306 
2309 
2310  if (type == opts->convert)
2311  return false;
2312  }
2313 
2314  return true;
2315 }
2316 
/* Body of the helper called as copy_balance_args(opts, args) by the balance-item writer
 * (signature, listing line 2317, and some flag-setting lines are absent from this extract).
 *
 * Translates the in-memory filter set `opts` (BTRFS_BALANCE_OPTS_* flags) into the on-disk
 * argument structure `args` (BALANCE_ARGS_FLAGS_* flags): for each filter enabled in
 * opts->flags the matching fields are copied and, where the line is visible, the matching
 * args flag is set. Fields of args belonging to disabled filters are left untouched. */
2318  if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2319  args->profiles = opts->profiles;
2321  }
2322 
2323  if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
// NOTE(review): this tests the destination (args->usage_start), not opts; when args has been
// zero-filled beforehand, as the visible caller does, the else branch is never taken --
// confirm whether opts->usage_start was intended.
2324  if (args->usage_start == 0) {
2326  args->usage_start = opts->usage_start;
2327  args->usage_end = opts->usage_end;
2328  } else {
2329  args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2330  args->usage = opts->usage_end;
2331  }
2332  }
2333 
2334  if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2335  args->devid = opts->devid;
2336  args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2337  }
2338 
2339  if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2340  args->drange_start = opts->drange_start;
2341  args->drange_end = opts->drange_end;
2342  args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2343  }
2344 
2345  if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2346  args->vrange_start = opts->vrange_start;
2347  args->vrange_end = opts->vrange_end;
2348  args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2349  }
2350 
// the soft flag is only propagated together with a conversion target
2351  if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2352  args->convert = opts->convert;
2353  args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2354 
2355  if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2356  args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2357  }
2358 
2359  if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
// NOTE(review): same pattern as the usage filter above -- the test reads args, not opts.
2360  if (args->limit_start == 0) {
2362  args->limit_start = (uint32_t)opts->limit_start;
2363  args->limit_end = (uint32_t)opts->limit_end;
2364  } else {
2365  args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2366  args->limit = opts->limit_end;
2367  }
2368  }
2369 
2370  if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2371  args->stripes_start = opts->stripes_start;
2372  args->stripes_end = opts->stripes_end;
2374  }
2375 }
2376 
/* Body of the routine that persists the current balance options (signature, listing line
 * 2377, and a few other lines are absent from this extract).
 *
 * Under Vcb->tree_lock held exclusively:
 *   1. looks up key (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0) in the root tree and, on an exact
 *      match, handles the existing item (the call itself is on a missing line; the error
 *      message names delete_tree_item);
 *   2. builds a zero-filled BALANCE_ITEM and fills the data / metadata / system sections for
 *      each option set that has BTRFS_BALANCE_OPTS_ENABLED;
 *   3. inserts it under the same key and, if everything succeeded, flushes with do_write.
 * Trees are freed and the lock released on every path. Returns the resulting NTSTATUS. */
2378  KEY searchkey;
2379  traverse_ptr tp;
2380  NTSTATUS Status;
2381  BALANCE_ITEM* bi;
2382 
2383  searchkey.obj_id = BALANCE_ITEM_ID;
2384  searchkey.obj_type = TYPE_TEMP_ITEM;
2385  searchkey.offset = 0;
2386 
2387  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2388 
2389  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2390  if (!NT_SUCCESS(Status)) {
2391  ERR("find_item returned %08x\n", Status);
2392  goto end;
2393  }
2394 
// keycmp() == 0 means find_item landed exactly on an existing balance item
2395  if (!keycmp(tp.item->key, searchkey)) {
2397  if (!NT_SUCCESS(Status)) {
2398  ERR("delete_tree_item returned %08x\n", Status);
2399  goto end;
2400  }
2401  }
2402 
2404  if (!bi) {
2405  ERR("out of memory\n");
2407  goto end;
2408  }
2409 
2410  RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2411 
2412  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2413  bi->flags |= BALANCE_FLAGS_DATA;
2414  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2415  }
2416 
2417  if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2419  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2420  }
2421 
2422  if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2423  bi->flags |= BALANCE_FLAGS_SYSTEM;
2424  copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2425  }
2426 
2427  Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2428  if (!NT_SUCCESS(Status)) {
2429  ERR("insert_tree_item returned %08x\n", Status);
// the buffer is released here when insertion fails
2430  ExFreePool(bi);
2431  goto end;
2432  }
2433 
2435 
2436 end:
// only flush when every preceding step succeeded
2437  if (NT_SUCCESS(Status)) {
2438  Status = do_write(Vcb, NULL);
2439  if (!NT_SUCCESS(Status))
2440  ERR("do_write returned %08x\n", Status);
2441  }
2442 
2443  free_trees(Vcb);
2444 
2445  ExReleaseResourceLite(&Vcb->tree_lock);
2446 
2447  return Status;
2448 }
2449 
/* Body of the routine that removes the persisted balance item (signature, listing line 2450,
 * and two other lines are absent from this extract).
 *
 * Under Vcb->tree_lock held exclusively, looks up key (BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0) in
 * the root tree. On an exact match the item is handled (call on a missing line; the error
 * message names delete_tree_item), the change is flushed with do_write and the trees are
 * freed. Returns the resulting NTSTATUS.
 *
 * NOTE(review): free_trees is only reached on the fully successful path inside the `if`;
 * the `goto end` error paths skip it -- confirm whether that is intended. */
2451  KEY searchkey;
2452  traverse_ptr tp;
2453  NTSTATUS Status;
2454 
2455  searchkey.obj_id = BALANCE_ITEM_ID;
2456  searchkey.obj_type = TYPE_TEMP_ITEM;
2457  searchkey.offset = 0;
2458 
2459  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2460 
2461  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2462  if (!NT_SUCCESS(Status)) {
2463  ERR("find_item returned %08x\n", Status);
2464  goto end;
2465  }
2466 
// keycmp() == 0: a balance item exists with exactly this key
2467  if (!keycmp(tp.item->key, searchkey)) {
2469  if (!NT_SUCCESS(Status)) {
2470  ERR("delete_tree_item returned %08x\n", Status);
2471  goto end;
2472  }
2473 
2474  Status = do_write(Vcb, NULL);
2475  if (!NT_SUCCESS(Status)) {
2476  ERR("do_write returned %08x\n", Status);
2477  goto end;
2478  }
2479 
2480  free_trees(Vcb);
2481  }
2482 
2484 
2485 end:
2486  ExReleaseResourceLite(&Vcb->tree_lock);
2487 
2488  return Status;
2489 }
2490 
/* Body of a helper that converts on-disk balance arguments `args` (BALANCE_ARGS_FLAGS_*)
 * into the in-memory filter set `opts` (signature, listing lines 2491-2492, and most of the
 * lines that set opts->flags are absent from this extract; the name used for this block is
 * inferred, not visible).
 *
 * For every filter flagged in args->flags the matching opts fields are filled in. The
 * single-value forms (USAGE, LIMIT) become ranges starting at 0; the *_RANGE forms copy both
 * bounds. Usage and stripe bounds are narrowed with explicit casts to uint8_t / uint16_t. */
2493 
2494  if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2496  opts->profiles = args->profiles;
2497  }
2498 
// single-value usage takes precedence over the range form when both flags are present
2499  if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2501 
2502  opts->usage_start = 0;
2503  opts->usage_end = (uint8_t)args->usage;
2504  } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2506 
2507  opts->usage_start = (uint8_t)args->usage_start;
2508  opts->usage_end = (uint8_t)args->usage_end;
2509  }
2510 
2511  if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2513  opts->devid = args->devid;
2514  }
2515 
2516  if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2518  opts->drange_start = args->drange_start;
2519  opts->drange_end = args->drange_end;
2520  }
2521 
2522  if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2524  opts->vrange_start = args->vrange_start;
2525  opts->vrange_end = args->vrange_end;
2526  }
2527 
// same precedence rule as for usage
2528  if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2530 
2531  opts->limit_start = 0;
2532  opts->limit_end = args->limit;
2533  } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2535 
2536  opts->limit_start = args->limit_start;
2537  opts->limit_end = args->limit_end;
2538  }
2539 
2540  if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2542 
2543  opts->stripes_start = (uint16_t)args->stripes_start;
2544  opts->stripes_end = (uint16_t)args->stripes_end;
2545  }
2546 
// the soft flag is only honoured together with a conversion target
2547  if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2549  opts->convert = args->convert;
2550 
2551  if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2552  opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2553  }
2554 }
2555 
/* Body of the routine logged by its caller as "remove_superblocks" (signature, listing line
 * 2556, and the allocation of `sb` are absent from this extract).
 *
 * Overwrites the device's superblock copies with zeroes: a superblock-sized buffer is
 * zero-filled and written with write_data_phys at each address in superblock_addrs, for as
 * long as the entry is non-zero and the whole superblock fits within
 * dev->devitem.num_bytes.
 *
 * Returns the status of the first failed write, otherwise STATUS_SUCCESS (also when no
 * address qualified). The buffer is freed on both paths. */
2557  NTSTATUS Status;
2558  superblock* sb;
2559  int i = 0;
2560 
2562  if (!sb) {
2563  ERR("out of memory\n");
2565  }
2566 
2567  RtlZeroMemory(sb, sizeof(superblock));
2568 
// a zero entry ends the walk over superblock_addrs; the walk also stops at the first
// location that would extend past the end of the device
2569  while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2570  Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2571 
2572  if (!NT_SUCCESS(Status)) {
2573  ExFreePool(sb);
2574  return Status;
2575  }
2576 
2577  i++;
2578  }
2579 
2580  ExFreePool(sb);
2581 
2582  return STATUS_SUCCESS;
2583 }
2584 
// Final stage of removing `dev` from the volume. In order, the visible code:
//   1. flushes pending changes (do_write) and frees the cached trees;
//   2. looks up the (1, TYPE_DEV_ITEM, dev_id) key in the chunk tree and the
//      (0, TYPE_DEV_STATS, dev_id) key in the device tree;
//   3. decrements the superblock device count / total bytes, unlinks the device
//      from its list and flushes again;
//   4. removes the matching child entry from the volume's PDO child list,
//      optionally re-registering the device with mountmgr;
//   5. recomputes FILE_REMOVABLE_MEDIA and Vcb->trim, and frees `dev`.
// Returns the first failing status from do_write / find_item / tree deletion,
// otherwise STATUS_SUCCESS.
// NOTE(review): the signature (listing line 2585), the declaration of `vde`
// (2590) and many statements are missing from this listing (the embedded
// numbers skip). The name is presumably finish_removing_device, going by the
// ERR text at listing line 3413 - confirm against the real source.
2586  KEY searchkey;
2587  traverse_ptr tp;
2588  NTSTATUS Status;
2589  LIST_ENTRY* le;
2591 
2592  if (Vcb->need_write) {
2593  Status = do_write(Vcb, NULL);
2594 
2595  if (!NT_SUCCESS(Status))
2596  ERR("do_write returned %08x\n", Status);
// NOTE(review): the statement governed by this `else` (listing line 2598) is
// missing here; it presumably gives Status a value when no write was needed.
2597  } else
2599 
// Trees are freed whether or not the write succeeded; the error is returned after.
2600  free_trees(Vcb);
2601 
2602  if (!NT_SUCCESS(Status))
2603  return Status;
2604 
2605  // remove entry in chunk tree
2606 
2607  searchkey.obj_id = 1;
2608  searchkey.obj_type = TYPE_DEV_ITEM;
2609  searchkey.offset = dev->devitem.dev_id;
2610 
2611  Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2612  if (!NT_SUCCESS(Status)) {
2613  ERR("find_item returned %08x\n", Status);
2614  return Status;
2615  }
2616 
// Only act when find_item landed exactly on the key searched for.
// NOTE(review): the call that sets Status here (2618) is missing from the
// listing; per the ERR text below it is delete_tree_item.
2617  if (!keycmp(searchkey, tp.item->key)) {
2619 
2620  if (!NT_SUCCESS(Status)) {
2621  ERR("delete_tree_item returned %08x\n", Status);
2622  return Status;
2623  }
2624  }
2625 
2626  // remove stats entry in device tree
2627 
2628  searchkey.obj_id = 0;
2629  searchkey.obj_type = TYPE_DEV_STATS;
2630  searchkey.offset = dev->devitem.dev_id;
2631 
2632  Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2633  if (!NT_SUCCESS(Status)) {
2634  ERR("find_item returned %08x\n", Status);
2635  return Status;
2636  }
2637 
// Same exact-match rule as above (the deletion call at 2639 is also missing).
2638  if (!keycmp(searchkey, tp.item->key)) {
2640 
2641  if (!NT_SUCCESS(Status)) {
2642  ERR("delete_tree_item returned %08x\n", Status);
2643  return Status;
2644  }
2645  }
2646 
2647  // update superblock
2648 
2649  Vcb->superblock.num_devices--;
2650  Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2651  Vcb->devices_loaded--;
2652 
2653  RemoveEntryList(&dev->list_entry);
2654 
2655  // flush
2656 
2657  Status = do_write(Vcb, NULL);
2658  if (!NT_SUCCESS(Status))
2659  ERR("do_write returned %08x\n", Status);
2660 
2661  free_trees(Vcb);
2662 
2663  if (!NT_SUCCESS(Status))
2664  return Status;
2665 
// Best-effort step for writable devices that still have a device object: a
// failure is only logged as a warning. NOTE(review): the call itself (2667) is
// missing; per the WARN text it is remove_superblocks.
2666  if (!dev->readonly && dev->devobj) {
2668  if (!NT_SUCCESS(Status))
2669  WARN("remove_superblocks returned %08x\n", Status);
2670  }
2671 
2672  // remove entry in volume list
2673 
2674  vde = Vcb->vde;
2675 
2676  if (dev->devobj) {
2677  pdo_device_extension* pdode = vde->pdode;
2678 
2680 
// Find the child whose UUID equals this device's UUID; only the first match is
// handled (the loop breaks after it).
// NOTE(review): the declaration of `vc` (2683) is missing from the listing.
2681  le = pdode->children.Flink;
2682  while (le != &pdode->children) {
2684 
2685  if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2688  UNICODE_STRING mmdevpath;
2689 
2690  pdode->children_loaded--;
2691 
2692  if (vc->had_drive_letter) { // re-add entry to mountmgr
2695  if (!NT_SUCCESS(Status))
2696  ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2697  else {
2698  MOUNTDEV_NAME mdn;
2699 
// First query uses a fixed-size MOUNTDEV_NAME; its NameLength is then used to
// size the buffer for the second query. NOTE(review): the condition guarding
// the next ERR (2701) is missing from the listing.
2700  Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2702  ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2703  else {
2704  MOUNTDEV_NAME* mdn2;
2705  ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2706 
2707  mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2708  if (!mdn2)
2709  ERR("out of memory\n");
2710  else {
2711  Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2712  if (!NT_SUCCESS(Status))
2713  ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2714  else {
2716 
// `name` points into mdn2, so it must be consumed before mdn2 is freed below.
2717  name.Buffer = mdn2->Name;
2718  name.Length = name.MaximumLength = mdn2->NameLength;
2719 
2721  if (!NT_SUCCESS(Status))
2722  WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2723  }
2724 
2725  ExFreePool(mdn2);
2726  }
2727  }
2728 
2730  }
2731  }
2732 
// Release the child entry's name buffer and the entry itself.
2733  ExFreePool(vc->pnp_name.Buffer);
2735  ExFreePool(vc);
2736 
2738 
2739  break;
2740  }
2741 
2742  le = le->Flink;
2743  }
2744 
// If the volume was flagged removable, clear the flag and set it again only if
// one of the remaining children still reports FILE_REMOVABLE_MEDIA.
2745  if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2746  vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2747 
2748  le = pdode->children.Flink;
2749  while (le != &pdode->children) {
2751 
2752  if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2753  vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2754  break;
2755  }
2756 
2757  le = le->Flink;
2758  }
2759  }
2760 
2761  pdode->num_children = Vcb->superblock.num_devices;
2762 
2764 
2765  // free dev
2766 
// NOTE(review): the statement governed by this `if` (2768) is missing from the
// listing; presumably it trims the whole device - confirm.
2767  if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2769  }
2770 
// Drain and free the device's free-space list.
// NOTE(review): the declaration of `s` (2773) is missing from the listing.
2771  while (!IsListEmpty(&dev->space)) {
2772  LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2774 
2775  ExFreePool(s);
2776  }
2777 
2778  ExFreePool(dev);
2779 
// Vcb->trim stays true only if at least one remaining device supports trim.
2780  if (Vcb->trim) {
2781  Vcb->trim = false;
2782 
2783  le = Vcb->devices.Flink;
2784  while (le != &Vcb->devices) {
2785  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2786 
2787  if (dev2->trim) {
2788  Vcb->trim = true;
2789  break;
2790  }
2791 
2792  le = le->Flink;
2793  }
2794  }
2795 
2797 
2798  return STATUS_SUCCESS;
2799 }
2800 
// Issues a TRIM for every region of `dev` that is not covered by a DEV_EXTENT.
// The device tree is walked for this device's TYPE_DEV_EXTENT items; each gap
// between consecutive extents (and the tail up to devitem.num_bytes) is queued
// via add_trim_entry_avoid_sb, then the queued ranges are packed into one
// DEVICE_MANAGE_DATA_SET_ATTRIBUTES request. Failures are logged only; the
// function returns nothing.
// The queued entries on dev->trim_list are always released at `end:`.
// NOTE(review): the signature (listing lines 2801-2802), the allocation of
// `dmdsa` (2858), some field assignments (2866, 2869), the declarations of `s`
// (2877, 2894) and the ioctl call (2886) are missing from this listing.
2803  DEVICE_DATA_SET_RANGE* ranges;
2804  ULONG datalen, i;
2805  KEY searchkey;
2806  traverse_ptr tp;
2807  NTSTATUS Status;
2808  bool b;
2809  uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2810  LIST_ENTRY* le;
2811 
2812  dev->num_trim_entries = 0;
2813 
2814  searchkey.obj_id = dev->devitem.dev_id;
2815  searchkey.obj_type = TYPE_DEV_EXTENT;
2816  searchkey.offset = 0;
2817 
// Nothing has been queued yet at this point, so a plain return is safe here.
2818  Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2819  if (!NT_SUCCESS(Status)) {
2820  ERR("find_item returned %08x\n", Status);
2821  return;
2822  }
2823 
2824  do {
2825  traverse_ptr next_tp;
2826 
2827  if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2828  if (tp.item->size >= sizeof(DEV_EXTENT)) {
2829  DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2830 
// Queue the hole between the end of the previous extent and this one.
2831  if (tp.item->key.offset > lastoff)
2832  add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2833 
2834  lastoff = tp.item->key.offset + de->length;
2835  } else {
2836  ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
// Entries may already be queued from earlier extents: go through the common
// cleanup so they are freed. Otherwise they would stay on dev->trim_list while
// the next call resets num_trim_entries to 0, and the ranges buffer (sized from
// the counter) would be too small for the list walk.
2837  goto end;
2838  }
2839  }
2840 
2841  b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2842 
// Stop once the walk has moved past this device's DEV_EXTENT keys.
2843  if (b) {
2844  tp = next_tp;
2845  if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2846  break;
2847  }
2848  } while (b);
2849 
// Queue the tail between the last extent and the end of the device.
2850  if (lastoff < dev->devitem.num_bytes)
2851  add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2852 
2853  if (dev->num_trim_entries == 0)
2854  return;
2855 
// Request layout: header rounded up to 8 bytes, followed by the range array.
2856  datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2857 
2859  if (!dmdsa) {
2860  ERR("out of memory\n");
2861  goto end;
2862  }
2863 
2864  dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2865  dmdsa->Action = DeviceDsmAction_Trim;
2867  dmdsa->ParameterBlockOffset = 0;
2868  dmdsa->ParameterBlockLength = 0;
2870  dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2871 
2872  ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2873 
// Copy each queued entry into the request's range array.
2874  i = 0;
2875  le = dev->trim_list.Flink;
2876  while (le != &dev->trim_list) {
2878 
2879  ranges[i].StartingOffset = s->address;
2880  ranges[i].LengthInBytes = s->size;
2881  i++;
2882 
2883  le = le->Flink;
2884  }
2885 
// A failed TRIM is not fatal; it is only logged.
2887  if (!NT_SUCCESS(Status))
2888  WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2889 
2890  ExFreePool(dmdsa);
2891 
// Common cleanup: free every queued entry and reset the counter.
2892 end:
2893  while (!IsListEmpty(&dev->trim_list)) {
2895  ExFreePool(s);
2896  }
2897 
2898  dev->num_trim_entries = 0;
2899 }
2900 
// Tries to make room for a new chunk by repeatedly relocating data chunks.
// Each pass picks the least-used writable data chunk not yet handled in this
// balance run, relocates it with balance_data_chunk until nothing changes,
// marks it done and flushes. When no candidate is left, alloc_chunk is called
// with `flags` and the result is stored in *newchunk.
// Returns the first failing status from balance_data_chunk / do_write /
// alloc_chunk, otherwise STATUS_SUCCESS.
// NOTE(review): when Vcb->balance.stopping becomes set, the function returns
// STATUS_SUCCESS without ever writing *newchunk - callers must not assume the
// out-parameter is valid on that path.
// NOTE(review): the signature (listing line 2901), the declaration of `c`
// (2919) and several statements (2936, 2952, 2966) are missing from this
// listing; the name is taken from the call sites in balance_thread.
2902  NTSTATUS Status;
2903  bool changed;
2904  LIST_ENTRY* le;
2905  chunk* rc;
2906 
2907  // FIXME - allow with metadata chunks?
2908 
2909  while (true) {
2910  rc = NULL;
2911 
2912  ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2913 
2914  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2915 
2916  // choose the least-used chunk we haven't looked at yet
2917  le = Vcb->chunks.Flink;
2918  while (le != &Vcb->chunks) {
2920 
2921  // FIXME - skip full-size chunks over e.g. 90% full?
// balance_num equal to the current run's number marks a chunk as already done.
2922  if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2923  rc = c;
2924 
2925  le = le->Flink;
2926  }
2927 
2928  ExReleaseResourceLite(&Vcb->chunk_lock);
2929 
// No candidate left: leave the loop and attempt the allocation below.
2930  if (!rc) {
2931  ExReleaseResourceLite(&Vcb->tree_lock);
2932  break;
2933  }
2934 
// A non-NULL Flink means the chunk is queued on the balance list; it is
// accounted as done here. NOTE(review): one statement (2936) is missing from
// the listing, presumably the list removal - confirm.
2935  if (rc->list_entry_balance.Flink) {
2937  Vcb->balance.chunks_left--;
2938  }
2939 
2940  rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2941  rc->reloc = true;
2942 
2943  ExReleaseResourceLite(&Vcb->tree_lock);
2944 
// Relocate until a pass reports no change, honouring pause and stop requests.
2945  do {
2946  changed = false;
2947 
2948  Status = balance_data_chunk(Vcb, rc, &changed);
2949  if (!NT_SUCCESS(Status)) {
2950  ERR("balance_data_chunk returned %08x\n", Status);
2951  Vcb->balance.status = Status;
2953  rc->reloc = false;
2954  return Status;
2955  }
2956 
// Blocks here while the balance event is not signalled.
2957  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2958 
2959  if (Vcb->readonly)
2960  Vcb->balance.stopping = true;
2961 
2962  if (Vcb->balance.stopping)
2963  return STATUS_SUCCESS;
2964  } while (changed);
2965 
2967 
// Mark the chunk dirty and record that this balance run has handled it.
2968  rc->changed = true;
2969  rc->space_changed = true;
2970  rc->balance_num = Vcb->balance.balance_num;
2971 
2972  Status = do_write(Vcb, NULL);
2973  if (!NT_SUCCESS(Status)) {
2974  ERR("do_write returned %08x\n", Status);
2975  return Status;
2976  }
2977 
2978  free_trees(Vcb);
2979  }
2980 
2981  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2982 
2983  Status = alloc_chunk(Vcb, flags, &rc, true);
2984 
2985  ExReleaseResourceLite(&Vcb->chunk_lock);
2986 
// *newchunk is written only when the allocation succeeded.
2987  if (NT_SUCCESS(Status)) {
2988  *newchunk = rc;
2989  return Status;
2990  } else {
2991  ERR("alloc_chunk returned %08x\n", Status);
2992  return Status;
2993  }
2994 }
2995 
// Rebuilds dev->space (the device's free-space list) from scratch: the list is
// emptied, one range covering everything from 1 MB up to devitem.num_bytes is
// added, and then the on-device extent of every chunk stripe that lives on
// this device is subtracted. Always returns STATUS_SUCCESS.
// NOTE(review): the signature (listing line 2996) and the declarations of `s`
// (3000) and `c` (3012) are missing from this listing; the name is taken from
// the WARN text at listing line 3447.
2997  LIST_ENTRY* le;
2998 
// Drop whatever the list currently holds.
2999  while (!IsListEmpty(&dev->space)) {
3001 
3002  ExFreePool(s);
3003  }
3004 
3005  // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3006 
3007  space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3008 
3009  le = Vcb->chunks.Flink;
3010  while (le != &Vcb->chunks) {
3011  uint16_t n;
// The stripe array is read from the memory directly after the CHUNK_ITEM header.
3013  CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3014 
3015  for (n = 0; n < c->chunk_item->num_stripes; n++) {
// NOTE(review): stripe_size is re-initialised on every iteration, so the
// `== 0` test below is always true and the size is recomputed per stripe.
3016  uint64_t stripe_size = 0;
3017 
3018  if (cis[n].dev_id == dev->devitem.dev_id) {
3019  if (stripe_size == 0) {
3020  uint16_t factor;
3021 
// `factor` is the divisor applied to the chunk size to get the per-stripe size.
3022  if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3023  factor = c->chunk_item->num_stripes;
3024  else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3025  factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3026  else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3027  factor = c->chunk_item->num_stripes - 1;
3028  else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3029  factor = c->chunk_item->num_stripes - 2;
3030  else // SINGLE, DUP, RAID1
3031  factor = 1;
3032 
// NOTE(review): nothing here guards against factor == 0 (e.g. a RAID5 chunk
// item reporting a single stripe) - presumably excluded elsewhere; confirm.
3033  stripe_size = c->chunk_item->size / factor;
3034  }
3035 
3036  space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3037  }
3038  }
3039 
3040  le = le->Flink;
3041  }
3042 
3043  return STATUS_SUCCESS;
3044 }
3045 
// System-thread entry point that runs one balance operation on the volume
// passed as `context`.
// Phases, as visible in this listing:
//   1. apply any "convert" profile to Vcb->data/metadata/system_flags,
//      remembering the old values so they can be restored on stop/error;
//   2. persist the balance item (or just flush, for device removal/shrink);
//   3. build the local `chunks` work list from Vcb->chunks;
//   4. if some chunk type has no chunk left outside the work list, allocate a
//      fresh chunk of that type first, falling back to try_consolidation when
//      the disk is full;
//   5. relocate data chunks, then metadata/system chunks, waiting on
//      Vcb->balance.event between passes so pause/stop requests take effect;
//   6. at `end:`, undo state on stop/error, finish device removal or shrink,
//      remove the balance item, trim unallocated space, and signal
//      Vcb->balance.finished.
// Errors are reported through Vcb->balance.status; the routine returns nothing.
// NOTE(review): this listing omits several lines (the embedded numbers skip),
// including the declaration of `Vcb` (3048), the per-chunk lock/unlock and
// `sort` assignments in phase 3, and the calls whose results are checked at
// listing lines 3094, 3157, 3412, 3446, 3456, 3462 and 3481.
3046 _Function_class_(KSTART_ROUTINE)
3047 void __stdcall balance_thread(void* context) {
3049  LIST_ENTRY chunks;
3050  LIST_ENTRY* le;
3051  uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3052  uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3053  NTSTATUS Status;
3054 
// A new run number; chunks stamped with it count as already handled.
3055  Vcb->balance.balance_num++;
3056 
3057  Vcb->balance.stopping = false;
3058  KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3059 
// When converting, new chunks must be allocated with the target profile.
// BLOCK_FLAG_SINGLE is a request marker only and maps to "no RAID bits".
3060  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3061  old_data_flags = Vcb->data_flags;
3062  Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3063 
3065  }
3066 
3067  if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3068  old_metadata_flags = Vcb->metadata_flags;
3069  Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3070  }
3071 
3072  if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3073  old_system_flags = Vcb->system_flags;
3074  Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3075  }
3076 
// With mixed block groups, data and metadata share one set of options:
// whichever of the two is enabled is copied over the other (data wins).
3077  if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3078  if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3079  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3080  else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3081  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3082  }
3083 
3084  num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3085  Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3086 
3087  InitializeListHead(&chunks);
3088 
3089  // FIXME - what are we supposed to do with limit_start?
3090 
// A normal balance records its balance item; removal/shrink only flushes.
3091  if (!Vcb->readonly) {
3092  if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3094  if (!NT_SUCCESS(Status)) {
3095  ERR("add_balance_item returned %08x\n", Status);
3096  Vcb->balance.status = Status;
3097  goto end;
3098  }
3099  } else {
3100  if (Vcb->need_write) {
3101  Status = do_write(Vcb, NULL);
3102 
3103  free_trees(Vcb);
3104 
3105  if (!NT_SUCCESS(Status)) {
3106  ERR("do_write returned %08x\n", Status);
3107  Vcb->balance.status = Status;
3108  goto end;
3109  }
3110  }
3111  }
3112  }
3113 
// Blocks here while the balance is paused (event not signalled).
3114  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3115 
3116  if (Vcb->balance.stopping)
3117  goto end;
3118 
3119  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3120 
// Classify every chunk: either it goes on the work list, or it is counted as
// an "okay" chunk of its type that will be left where it is.
3121  le = Vcb->chunks.Flink;
3122  while (le != &Vcb->chunks) {
3124  uint8_t sort;
3125 
3127 
3128  if (c->chunk_item->type & BLOCK_FLAG_DATA)
3130  else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3132  else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3134  else {
3135  ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3137  break;
3138  }
3139 
// NOTE(review): the second half of this condition (3141) is missing from the listing.
3140  if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3142  InsertTailList(&chunks, &c->list_entry_balance);
3143 
3144  num_chunks[sort]++;
3145  Vcb->balance.total_chunks++;
3146  Vcb->balance.chunks_left++;
3147  } else if (sort == BALANCE_OPTS_METADATA)
3148  okay_metadata_chunks++;
3149  else if (sort == BALANCE_OPTS_DATA)
3150  okay_data_chunks++;
3151  else if (sort == BALANCE_OPTS_SYSTEM)
3152  okay_system_chunks++;
3153 
3154  if (!c->cache_loaded) {
3156 
3157  if (!NT_SUCCESS(Status)) {
3158  ERR("load_cache_chunk returned %08x\n", Status);
3159  Vcb->balance.status = Status;
3161  ExReleaseResourceLite(&Vcb->chunk_lock);
3162  goto end;
3163  }
3164  }
3165 
3167 
3168  le = le->Flink;
3169  }
3170 
3171  ExReleaseResourceLite(&Vcb->chunk_lock);
3172 
3173  // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3174  if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3175  bool consolidated = false;
// Start from NULL: try_consolidation can return STATUS_SUCCESS without writing
// its out-parameter (when a stop is requested), so `c` must never be read
// uninitialised below.
3176  chunk* c = NULL;
3177 
3178  if (okay_metadata_chunks == 0) {
3179  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3180 
3181  Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3182  if (NT_SUCCESS(Status))
3183  c->balance_num = Vcb->balance.balance_num;
// Disk-full is tolerated once: consolidation is attempted below. Any other
// failure, or disk-full after consolidation has already run, is fatal.
3184  else if (Status != STATUS_DISK_FULL || consolidated) {
3185  ERR("alloc_chunk returned %08x\n", Status);
3186  ExReleaseResourceLite(&Vcb->chunk_lock);
3187  Vcb->balance.status = Status;
3188  goto end;
3189  }
3190 
3191  ExReleaseResourceLite(&Vcb->chunk_lock);
3192 
3193  if (Status == STATUS_DISK_FULL) {
3194  Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3195  if (!NT_SUCCESS(Status)) {
3196  ERR("try_consolidation returned %08x\n", Status);
3197  Vcb->balance.status = Status;
3198  goto end;
// Only stamp a chunk that was actually handed back.
3199  } else if (c)
3200  c->balance_num = Vcb->balance.balance_num;
3201 
3202  consolidated = true;
3203 
3204  if (Vcb->balance.stopping)
3205  goto end;
3206  }
3207  }
3208 
3209  if (okay_data_chunks == 0) {
3210  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3211 
3212  Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3213  if (NT_SUCCESS(Status))
3214  c->balance_num = Vcb->balance.balance_num;
3215  else if (Status != STATUS_DISK_FULL || consolidated) {
3216  ERR("alloc_chunk returned %08x\n", Status);
3217  ExReleaseResourceLite(&Vcb->chunk_lock);
3218  Vcb->balance.status = Status;
3219  goto end;
3220  }
3221 
3222  ExReleaseResourceLite(&Vcb->chunk_lock);
3223 
3224  if (Status == STATUS_DISK_FULL) {
3225  Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3226  if (!NT_SUCCESS(Status)) {
3227  ERR("try_consolidation returned %08x\n", Status);
3228  Vcb->balance.status = Status;
3229  goto end;
3230  } else if (c)
3231  c->balance_num = Vcb->balance.balance_num;
3232 
3233  consolidated = true;
3234 
3235  if (Vcb->balance.stopping)
3236  goto end;
3237  }
3238  }
3239 
3240  if (okay_system_chunks == 0) {
3241  ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3242 
3243  Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3244  if (NT_SUCCESS(Status))
3245  c->balance_num = Vcb->balance.balance_num;
3246  else if (Status != STATUS_DISK_FULL || consolidated) {
3247  ERR("alloc_chunk returned %08x\n", Status);
3248  ExReleaseResourceLite(&Vcb->chunk_lock);
3249  Vcb->balance.status = Status;
3250  goto end;
3251  }
3252 
3253  ExReleaseResourceLite(&Vcb->chunk_lock);
3254 
3255  if (Status == STATUS_DISK_FULL) {
3256  Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3257  if (!NT_SUCCESS(Status)) {
3258  ERR("try_consolidation returned %08x\n", Status);
3259  Vcb->balance.status = Status;
3260  goto end;
3261  } else if (c)
3262  c->balance_num = Vcb->balance.balance_num;
3263 
3264  consolidated = true;
3265 
3266  if (Vcb->balance.stopping)
3267  goto end;
3268  }
3269  }
3270  }
3271 
3272  ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3273 
// Flag every queued chunk as being relocated.
3274  le = chunks.Flink;
3275  while (le != &chunks) {
3276  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3277 
3278  c->reloc = true;
3279 
3280  le = le->Flink;
3281  }
3282 
3283  ExReleaseResourceLite(&Vcb->chunk_lock);
3284 
3285  // do data chunks before metadata
3286  le = chunks.Flink;
3287  while (le != &chunks) {
3288  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
// Saved up front because `c` may be unlinked from the list below.
3289  LIST_ENTRY* le2 = le->Flink;
3290 
3291  if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3292  bool changed;
3293 
3294  do {
3295  changed = false;
3296 
3297  Status = balance_data_chunk(Vcb, c, &changed);
3298  if (!NT_SUCCESS(Status)) {
3299  ERR("balance_data_chunk returned %08x\n", Status);
3300  Vcb->balance.status = Status;
3301  goto end;
3302  }
3303 
3304  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3305 
3306  if (Vcb->readonly)
3307  Vcb->balance.stopping = true;
3308 
3309  if (Vcb->balance.stopping)
3310  break;
3311  } while (changed);
3312 
3313  c->changed = true;
3314  c->space_changed = true;
3315  }
3316 
3317  if (Vcb->balance.stopping)
3318  goto end;
3319 
// A data chunk is finished now unless it also carries the metadata flag and
// metadata balancing is enabled; in that case it stays queued for the next loop.
3320  if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3321  (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3322  RemoveEntryList(&c->list_entry_balance);
3323  c->list_entry_balance.Flink = NULL;
3324 
3325  Vcb->balance.chunks_left--;
3326  }
3327 
3328  le = le2;
3329  }
3330 
3331  // do metadata chunks
3332  while (!IsListEmpty(&chunks)) {
3333  chunk* c;
3334  bool changed;
3335 
3336  le = RemoveHeadList(&chunks);
3337  c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3338 
3339  if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3340  do {
3341  Status = balance_metadata_chunk(Vcb, c, &changed);
3342  if (!NT_SUCCESS(Status)) {
3343  ERR("balance_metadata_chunk returned %08x\n", Status);
3344  Vcb->balance.status = Status;
3345  goto end;
3346  }
3347 
3348  KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3349 
3350  if (Vcb->readonly)
3351  Vcb->balance.stopping = true;
3352 
3353  if (Vcb->balance.stopping)
3354  break;
3355  } while (changed);
3356 
3357  c->changed = true;
3358  c->space_changed = true;
3359  }
3360 
3361  if (Vcb->balance.stopping)
3362  break;
3363 
3364  c->list_entry_balance.Flink = NULL;
3365 
3366  Vcb->balance.chunks_left--;
3367  }
3368 
3369 end:
3370  if (!Vcb->readonly) {
// On stop or failure: un-flag the chunks still queued and restore the
// allocation profiles that a conversion had replaced.
3371  if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3372  le = chunks.Flink;
3373  while (le != &chunks) {
3374  chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3375  c->reloc = false;
3376 
3377  le = le->Flink;
3378  c->list_entry_balance.Flink = NULL;
3379  }
3380 
3381  if (old_data_flags != 0)
3382  Vcb->data_flags = old_data_flags;
3383 
3384  if (old_metadata_flags != 0)
3385  Vcb->metadata_flags = old_metadata_flags;
3386 
3387  if (old_system_flags != 0)
3388  Vcb->system_flags = old_system_flags;
3389  }
3390 
3391  if (Vcb->balance.removing) {
3392  device* dev = NULL;
3393 
3394  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3395 
// The device being removed is identified by opts[0].devid.
3396  le = Vcb->devices.Flink;
3397  while (le != &Vcb->devices) {
3398  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3399 
3400  if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3401  dev = dev2;
3402  break;
3403  }
3404 
3405  le = le->Flink;
3406  }
3407 
// The device is only finally removed when every queued chunk was relocated.
3408  if (dev) {
3409  if (Vcb->balance.chunks_left == 0) {
3411 
3412  if (!NT_SUCCESS(Status)) {
3413  ERR("finish_removing_device returned %08x\n", Status);
3414  dev->reloc = false;
3415  }
3416  } else
3417  dev->reloc = false;
3418  }
3419 
3420  ExReleaseResourceLite(&Vcb->tree_lock);
3421  } else if (Vcb->balance.shrinking) {
3422  device* dev = NULL;
3423 
3424  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3425 
3426  le = Vcb->devices.Flink;
3427  while (le != &Vcb->devices) {
3428  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3429 
3430  if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3431  dev = dev2;
3432  break;
3433  }
3434 
3435  le = le->Flink;
3436  }
3437 
// Setting an error status here routes a missing device into the branch below
// that tolerates dev == NULL.
3438  if (!dev) {
3439  ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3440  Vcb->balance.status = STATUS_INTERNAL_ERROR;
3441  }
3442 
3443  if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3444  if (dev) {
3446  if (!NT_SUCCESS(Status))
3447  WARN("regenerate_space_list returned %08x\n", Status);
3448  }
3449  } else {
3450  uint64_t old_size;
3451 
// The new device size is the start of the drained range.
3452  old_size = dev->devitem.num_bytes;
3453  dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3454 
3456  if (!NT_SUCCESS(Status)) {
3457  ERR("update_dev_item returned %08x\n", Status);
3458  dev->devitem.num_bytes = old_size;
3459  Vcb->balance.status = Status;
3460 
3462  if (!NT_SUCCESS(Status))
3463  WARN("regenerate_space_list returned %08x\n", Status);
3464  } else {
3465  Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3466 
3467  Status = do_write(Vcb, NULL);
3468  if (!NT_SUCCESS(Status))
3469  ERR("do_write returned %08x\n", Status);
3470 
3471  free_trees(Vcb);
3472  }
3473  }
3474 
3475  ExReleaseResourceLite(&Vcb->tree_lock);
3476 
// NOTE(review): the statement governed by this `if` (3478) is missing from the listing.
3477  if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3479  } else {
3481  if (!NT_SUCCESS(Status)) {
3482  ERR("remove_balance_item returned %08x\n", Status);
// Deliberately no jump back to `end:` here: that would re-run this cleanup for
// as long as the call keeps failing and never signal `finished`. The failure is
// logged and shutdown continues.
3484  }
3485  }
3486 
// Trim unallocated space on every writable, trim-capable device.
3487  if (Vcb->trim && !Vcb->options.no_trim) {
3488  ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3489 
3490  le = Vcb->devices.Flink;
3491  while (le != &Vcb->devices) {
3492  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3493 
3494  if (dev2->devobj && !dev2->readonly && dev2->trim)
3495  trim_unalloc_space(Vcb, dev2);
3496 
3497  le = le->Flink;
3498  }
3499 
3500  ExReleaseResourceLite(&Vcb->tree_lock);
3501  }
3502  }
3503 
// A NULL thread handle is what the control routines test for "no balance
// running"; `finished` is signalled last.
3504  ZwClose(Vcb->balance.thread);
3505  Vcb->balance.thread = NULL;
3506 
3507  KeSetEvent(&Vcb->balance.finished, 0, false);
3508 }
3509 
// Handler that validates a caller-supplied btrfs_start_balance request and
// launches balance_thread for it.
// Inputs visible here: `Vcb` (the volume), `data`/`length` (the request
// buffer and its size) and `processor_mode` (used for the privilege check).
// Returns STATUS_INVALID_PARAMETER for a short/NULL buffer or inconsistent
// filter ranges, STATUS_DEVICE_NOT_READY when the volume is locked or a scrub
// or balance is already running, the PsCreateSystemThread status on failure,
// and STATUS_SUCCESS otherwise.
// The request buffer is normalised in place (ranges are clamped).
// NOTE(review): the signature (listing line 3510), the declaration of `bsb`
// (3512), the statements under the privilege and read-only checks (3520, 3538),
// the condition for the early STATUS_SUCCESS (3540-3542), the profile mask
// (3548-3549), part of the convert test (3596), two option copies (3605-3606)
// and the `oa` initialisation (3614) are missing from this listing. The name is
// presumably start_balance - confirm against the real source.
3511  NTSTATUS Status;
3513  OBJECT_ATTRIBUTES oa;
3514  uint8_t i;
3515 
3516  if (length < sizeof(btrfs_start_balance) || !data)
3517  return STATUS_INVALID_PARAMETER;
3518 
3519  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3521 
3522  if (Vcb->locked) {
3523  WARN("cannot start balance while locked\n");
3524  return STATUS_DEVICE_NOT_READY;
3525  }
3526 
3527  if (Vcb->scrub.thread) {
3528  WARN("cannot start balance while scrub running\n");
3529  return STATUS_DEVICE_NOT_READY;
3530  }
3531 
3532  if (Vcb->balance.thread) {
3533  WARN("balance already running\n");
3534  return STATUS_DEVICE_NOT_READY;
3535  }
3536 
3537  if (Vcb->readonly)
3539 
3543  return STATUS_SUCCESS;
3544 
// Validate and normalise each enabled option set: data, metadata, system.
3545  for (i = 0; i < 3; i++) {
3546  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3547  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3550 
3551  if (bsb->opts[i].profiles == 0)
3552  return STATUS_INVALID_PARAMETER;
3553  }
3554 
3555  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3556  if (bsb->opts[i].devid == 0)
3557  return STATUS_INVALID_PARAMETER;
3558  }
3559 
3560  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3561  if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3562  return STATUS_INVALID_PARAMETER;
3563  }
3564 
3565  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3566  if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3567  return STATUS_INVALID_PARAMETER;
3568  }
3569 
// Limits and stripe counts are raised to at least 1 before the range check.
3570  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3571  bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3572  bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3573 
3574  if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3575  return STATUS_INVALID_PARAMETER;
3576  }
3577 
3578  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3579  bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3580  bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3581 
3582  if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3583  return STATUS_INVALID_PARAMETER;
3584  }
3585 
// Usage is a percentage: cap both ends at 100 and reject an inverted range.
// This must operate on the usage_* fields themselves; reading stripes_* here
// would overwrite the caller's usage filter with unrelated stripe values.
3586  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3587  bsb->opts[i].usage_start = min(100, bsb->opts[i].usage_start);
3588  bsb->opts[i].usage_end = min(100, bsb->opts[i].usage_end);
3589 
3590  if (bsb->opts[i].usage_start > bsb->opts[i].usage_end)
3591  return STATUS_INVALID_PARAMETER;
3592  }
3593 
// Only a known set of target profiles is accepted for conversion.
3594  if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3595  if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3597  bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3598  bsb->opts[i].convert != BLOCK_FLAG_SINGLE)
3599  return STATUS_INVALID_PARAMETER;
3600  }
3601  }
3602  }
3603 
3604  RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3607 
// Fresh run: not paused, not a removal or shrink, no error recorded. The event
// is created signalled because `paused` is false.
3608  Vcb->balance.paused = false;
3609  Vcb->balance.removing = false;
3610  Vcb->balance.shrinking = false;
3611  Vcb->balance.status = STATUS_SUCCESS;
3612  KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3613 
3615 
3616  Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3617  if (!NT_SUCCESS(Status)) {
3618  ERR("PsCreateSystemThread returned %08x\n", Status);
3619  return Status;
3620  }
3621 
3622  return STATUS_SUCCESS;
3623 }
3624 
// Looks in the root tree for a persisted balance item (BALANCE_ITEM_ID,
// TYPE_TEMP_ITEM, 0) and, if one exists, reloads its options and starts
// balance_thread to continue that balance.
// Returns STATUS_NOT_FOUND when there is no such item, STATUS_INTERNAL_ERROR
// when the item is smaller than BALANCE_ITEM, a find_item or
// PsCreateSystemThread failure status, or STATUS_SUCCESS.
// NOTE(review): the signature (listing line 3625) and the initialisation of
// `oa` (3697) are missing from this listing; the name is presumably
// look_for_balance_item - confirm against the real source.
3626  KEY searchkey;
3627  traverse_ptr tp;
3628  NTSTATUS Status;
3629  BALANCE_ITEM* bi;
3630  OBJECT_ATTRIBUTES oa;
3631  int i;
3632 
3633  searchkey.obj_id = BALANCE_ITEM_ID;
3634  searchkey.obj_type = TYPE_TEMP_ITEM;
3635  searchkey.offset = 0;
3636 
3637  Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3638  if (!NT_SUCCESS(Status)) {
3639  ERR("find_item returned %08x\n", Status);
3640  return Status;
3641  }
3642 
// A non-zero keycmp means find_item did not land exactly on the balance item.
3643  if (keycmp(tp.item->key, searchkey)) {
3644  TRACE("no balance item found\n");
3645  return STATUS_NOT_FOUND;
3646  }
3647 
3648  if (tp.item->size < sizeof(BALANCE_ITEM)) {
3649  WARN("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3650  tp.item->size, sizeof(BALANCE_ITEM));
3651  return STATUS_INTERNAL_ERROR;
3652  }
3653 
3654  bi = (BALANCE_ITEM*)tp.item->data;
3655 
// Load the options for each chunk type the stored item flags as present.
3656  if (bi->flags & BALANCE_FLAGS_DATA)
3657  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3658 
3659  if (bi->flags & BALANCE_FLAGS_METADATA)
3660  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3661 
3662  if (bi->flags & BALANCE_FLAGS_SYSTEM)
3663  load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3664 
3665  // do the heuristics that Linux driver does
3666 
3667  for (i = 0; i < 3; i++) {
3668  if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3669  // if converting, don't redo chunks already done
3670 
3671  if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3672  Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3673 
3674  // don't balance chunks more than 90% filled - presumably these
3675  // have already been done
3676 
// Applied only when neither a usage filter nor a conversion was stored.
3677  if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3678  !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3679  ) {
3680  Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3681  Vcb->balance.opts[i].usage_start = 0;
3682  Vcb->balance.opts[i].usage_end = 90;
3683  }
3684  }
3685  }
3686 
// Start paused on a read-only volume or when the skip_balance option is set.
3687  if (Vcb->readonly || Vcb->options.skip_balance)
3688  Vcb->balance.paused = true;
3689  else
3690  Vcb->balance.paused = false;
3691 
3692  Vcb->balance.removing = false;
3693  Vcb->balance.shrinking = false;
3694  Vcb->balance.status = STATUS_SUCCESS;
// The event starts signalled only when the balance is not paused.
3695  KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3696 
3698 
3699  Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3700  if (!NT_SUCCESS(Status)) {
3701  ERR("PsCreateSystemThread returned %08x\n", Status);
3702  return Status;
3703  }
3704 
3705  return STATUS_SUCCESS;
3706 }
3707 
// Fills the caller's btrfs_query_balance buffer (`bqb`) with the current
// balance state: status bits, chunk progress, the last recorded error and a
// copy of the three option sets.
// Returns STATUS_INVALID_PARAMETER for a short or NULL buffer, otherwise
// STATUS_SUCCESS.
// NOTE(review): the signature and the declaration of `bqb` (listing lines
// 3708-3709), the status assignment for the "no thread" case (3715) and the
// statement under the shrinking check (3731) are missing from this listing.
// The name is presumably query_balance - confirm against the real source.
3710 
3711  if (length < sizeof(btrfs_query_balance) || !data)
3712  return STATUS_INVALID_PARAMETER;
3713 
// No balance thread: only the status (and the error, if the last run failed)
// is reported; progress and option fields are not written on this path.
3714  if (!Vcb->balance.thread) {
3716 
3717  if (!NT_SUCCESS(Vcb->balance.status)) {
3718  bqb->status |= BTRFS_BALANCE_ERROR;
3719  bqb->error = Vcb->balance.status;
3720  }
3721 
3722  return STATUS_SUCCESS;
3723  }
3724 
3725  bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3726 
3727  if (Vcb->balance.removing)
3728  bqb->status |= BTRFS_BALANCE_REMOVAL;
3729 
3730  if (Vcb->balance.shrinking)
3732 
3733  if (!NT_SUCCESS(Vcb->balance.status))
3734  bqb->status |= BTRFS_BALANCE_ERROR;
3735 
3736  bqb->chunks_left = Vcb->balance.chunks_left;
3737  bqb->total_chunks = Vcb->balance.total_chunks;
3738  bqb->error = Vcb->balance.status;
3739  RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3740  RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3741  RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3742 
3743  return STATUS_SUCCESS;
3744 }
3745 
// Pauses a running balance: sets Vcb->balance.paused and clears the balance
// event, so the balance thread blocks at its next wait on that event.
// Returns STATUS_DEVICE_NOT_READY when no balance thread exists or the balance
// is already paused, otherwise STATUS_SUCCESS.
// NOTE(review): the signature (listing line 3746) and the statement under the
// privilege check (3748) are missing from this listing; the name is presumably
// pause_balance - confirm against the real source.
3747  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3749 
3750  if (!Vcb->balance.thread)
3751  return STATUS_DEVICE_NOT_READY;
3752 
3753  if (Vcb->balance.paused)
3754  return STATUS_DEVICE_NOT_READY;
3755 
3756  Vcb->balance.paused = true;
3757  KeClearEvent(&Vcb->balance.event);
3758 
3759  return STATUS_SUCCESS;
3760 }
3761 
// Resumes a paused balance: clears Vcb->balance.paused and signals the balance
// event so the waiting balance thread continues.
// Returns STATUS_DEVICE_NOT_READY when no balance thread exists or the balance
// is not paused, otherwise STATUS_SUCCESS.
// NOTE(review): the signature (listing line 3762) and the statements under the
// privilege check (3764) and the read-only check (3773) are missing from this
// listing; the name is presumably resume_balance - confirm against the real source.
3763  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3765 
3766  if (!Vcb->balance.thread)
3767  return STATUS_DEVICE_NOT_READY;
3768 
3769  if (!Vcb->balance.paused)
3770  return STATUS_DEVICE_NOT_READY;
3771 
3772  if (Vcb->readonly)
3774 
3775  Vcb->balance.paused = false;
3776  KeSetEvent(&Vcb->balance.event, 0, false);
3777 
3778  return STATUS_SUCCESS;
3779 }
3780 
/* Body of the routine that requests a balance stop: raises
 * Vcb->balance.stopping and signals Vcb->balance.event.
 *
 * NOTE(review): the function header (listing line 3781) is absent from this
 * listing; presumably this is stop_balance taking `Vcb` and `processor_mode` -
 * confirm against the real source.
 *
 * Returns STATUS_DEVICE_NOT_READY when there is no balance thread;
 * STATUS_SUCCESS once the stop request has been recorded. */
// The caller must hold SE_MANAGE_VOLUME_PRIVILEGE.
3782  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
// NOTE(review): listing line 3783 (the statement governed by the privilege check)
// is absent. As the listing stands, the check would instead govern the thread test
// below. Presumably the missing line returns a privilege error - TODO confirm.
3784 
3785  if (!Vcb->balance.thread)
3786  return STATUS_DEVICE_NOT_READY;
3787 
// Clear the pause flag, raise the stop flag, and reset the recorded status to
// success before signalling the event (presumably so a paused thread wakes and
// sees `stopping` - verify in balance_thread).
3788  Vcb->balance.paused = false;
3789  Vcb->balance.stopping = true;
3790  Vcb->balance.status = STATUS_SUCCESS;
3791  KeSetEvent(&Vcb->balance.event, 0, false);
3792 
3793  return STATUS_SUCCESS;
3794 }
3795 
3797  uint64_t devid;
3798  LIST_ENTRY* le;
3799  device* dev = NULL;
3800  NTSTATUS Status;
3801  int i;
3802  uint64_t num_rw_devices;
3803  OBJECT_ATTRIBUTES oa;
3804 
3805  TRACE("(%p, %p, %x)\n", Vcb, data, length);
3806 
3807  if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3809 
3810  if (length < sizeof(uint64_t))
3811  return STATUS_INVALID_PARAMETER;
3812 
3813  devid = *(uint64_t*)data;
3814 
3815  ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3816 
3817  if (Vcb->readonly) {
3818  ExReleaseResourceLite(&Vcb->tree_lock);
3820  }
3821 
3822  num_rw_devices = 0;
3823 
3824  le = Vcb->devices.Flink;
3825  while (le != &Vcb->devices) {
3826  device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3827 
3828  if (dev2->devitem.dev_id == devid)
3829  dev = dev2;
3830 
3831  if (!dev2->readonly)
3832  num_rw_devices++;
3833 
3834  le = le->Flink;
3835  }
3836 
3837  if (!dev) {
3838  ExReleaseResourceLite(&Vcb->tree_lock);
3839  WARN("device %I64x not found\n", devid);
3840  return STATUS_NOT_FOUND;
3841  }
3842 
3843  if (!dev->readonly) {
3844  if (num_rw_devices == 1) {
3845  ExReleaseResourceLite(&Vcb->tree_lock);
3846  WARN("not removing last non-readonly device\n");
3847  return STATUS_INVALID_PARAMETER;
3848  }
3849 
3850  if (num_rw_devices == 4 &&
3851  ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3852  (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6))
3853  ) {
3854  ExReleaseResourceLite(&Vcb->tree_lock);
3855  ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n");
3856  return STATUS_CANNOT_DELETE;
3857  }
3858 
3859  if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) {
3860  ExReleaseResourceLite(&Vcb->tree_lock);
3861  ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n");
3862  return STATUS_CANNOT_DELETE;
3863  }
3864 
3865  if (num_rw_devices == 2 &&
3866  ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3867  (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3868  ) {
3869  ExReleaseResourceLite(&Vcb->tree_lock);