node

Node is the in-memory, deserialized representation of a page. B+ tree operations — lookups, page splitting, page rebalancing, and so on — are performed on nodes rather than directly on pages.

type node struct {
	// The Bucket this node (page) belongs to; roughly equivalent to a table in a relational database.
	bucket     *Bucket
	// True if this node is a leaf node (stores K/V pairs rather than child page references).
	isLeaf     bool
	// Set when an element is deleted (see del); tells rebalance() this node may need merging.
	unbalanced bool
	// Set once this node has been written to dirty pages by spill(); prevents spilling twice.
	spilled    bool
	key        []byte // First (smallest) key of this page, i.e. node.inodes[0].key once populated.
	// Page id of the backing page (0 if the node has not been written to a page yet).
	pgid       pgid 
	// Parent node; nil for the root of the tree.
	parent     *node
	// In-memory child nodes; used to track which children need spilling/splitting.
	children   nodes
	// The elements of this node: K/V pairs for leaves, key/pgid pairs for branches.
	inodes     inodes
}
Copy the code

inode

An inode represents an internal element within a node: it may point to an element already written to a page (a K/V pair), or to a K/V pair that has not yet been written to a page.

type inode struct {
	flags uint32 BucketLeafFlag = bucketLeafFlag = bucketLeafFlag
	pgid  pgid   // The current corresponding page_id is 0 if the page has not been written yet
	key   []byte // The actual key
	value []byte // The actual value
}
Copy the code

BoltDB organizes its data as a B+ tree of pages, with all data stored in leaf nodes. Since a node is the deserialized in-memory form of a page, every B+ tree operation works by locating a page, deserializing it into a node, and then operating on that node. Node therefore supports the following operations:

// root returns the top-level node this node is attached to
// by walking parent pointers up the B+ tree.
func (n *node) root() *node {
	if n.parent == nil {
		return n
	}
	return n.parent.root()
}
Copy the code
// minKeys returns the minimum number of inodes this node should have.
// Leaf nodes need at least one key; branch nodes need at least two.
func (n *node) minKeys() int {
	if n.isLeaf {
		return 1
	}
	return 2
}
Copy the code
// size returns the size of the node after serialization to a page:
// page header + one element header per inode + all keys and values.
func (n *node) size() int {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
		sz += elsz + len(item.key) + len(item.value)
	}
	return sz
}
Copy the code
// childAt returns the child node at the given index. Keys within a page
// are kept sorted in ascending order, so index i is the i-th smallest child.
// Leaf nodes have no children, so calling this on a leaf is a programmer error.
func (n *node) childAt(index int) *node {
	if n.isLeaf {
		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
	}
	childID := n.inodes[index].pgid
	return n.bucket.node(childID, n)
}
Copy the code
// Returns the right and left siblings of the current node
func (n *node) nextSibling(a) *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index >= n.parent.numChildren()- 1 {
		return nil
	}
	return n.parent.childAt(index + 1)}func (n *node) prevSibling(a) *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index == 0 {
		return nil
	}
	return n.parent.childAt(index - 1)}Copy the code
// Put writes a key/value to a node. If olekey=newkey, replace value.
// Otherwise insert a new inode
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
	if pgid >= n.bucket.tx.meta.pgid {
		// Todo this judgment
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
	} 
	// Find where newkey is inserted
	index := sort.Search(len(n.inodes), func(i int) bool { returnbytes.Compare(n.inodes[i].key, oldKey) ! =- 1 })
	// Check if it is a replacement insert
	exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)
	if! exact {// If the key does not exist, insert a new node
		n.inodes = append(n.inodes, inode{})
		copy(n.inodes[index+1:], n.inodes[index:])
	}
	inode := &n.inodes[index]
	inode.flags = flags // Assign related elements
	inode.key = newKey 
	inode.value = value
	inode.pgid = pgid
}
Copy the code
func (n *node) del(key []byte) {
	// Find the first index not greater than key by key
	index := sort.Search(len(n.inodes), func(i int) bool { returnbytes.Compare(n.inodes[i].key, key) ! =- 1 })
	// Returns if the current key does not exist
	if index >= len(n.inodes) || ! bytes.Equal(n.inodes[index].key, key) {return
	}
	// Delete related ninodes if they exist
	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...).// The node rebalance is not being rebalanced.
	Rebalance is triggered when an element is deleted
	n.unbalanced = true
}
Copy the code
//read deserializes the contents of a page into an in-memory node
func (n *node) read(p *page){ n.pgid = p.id n.isLeaf = (p.flags & leafPageFlag) ! =0
	n.inodes = make(inodes, int(p.count)) // Allocate the corresponding inode space according to the number of page elements
	for i := 0; i < int(p.count); i++ {
		// Deserialize the data corresponding to pageElement into the inode
		inode := &n.inodes[i]
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			inode.flags = elem.flags
			inode.key = elem.key()
			inode.value = elem.value()
		} else {
			elem := p.branchPageElement(uint16(i))
			inode.pgid = elem.pgid
			inode.key = elem.key()
		}
	}
	// If the current node has at least one inode, node.key equals the key of the first inode
	if len(n.inodes) > 0 {
		n.key = n.inodes[0].key
	} else {
		n.key = nil}}Copy the code
//write Writes the node item to one or more pages
// If the current node cannot hold a page, multiple consecutive pages will be allocated to store the current node
func (n *node) write(p *page) {
	if n.isLeaf {
		// Flag the page based on whether it is currently a leaf node
		p.flags |= leafPageFlag
	} else {
		p.flags |= branchPageFlag
	}
	// A maximum of 65535 inodes
	if len(n.inodes) >= 0xFFFF {
		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)".len(n.inodes), p.id))
	}
	p.count = uint16(len(n.inodes))
	// If the current node contains no elements, return
	if p.count == 0 {
		return
	}
	// Find the write offset in page according to pageElementSize*lengthOfInode
	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
	for i, item := range n.inodes {
		/ / to traverse the node of the inode, writes it to the page, write pageHeader (leafElement/branchElement)
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0-))uintptr(unsafe.Pointer(elem)))
			elem.flags = item.flags
			elem.ksize = uint32(len(item.key))
			elem.vsize = uint32(len(item.value))
		} else {
			elem := p.branchPageElement(uint16(i))
			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0-))uintptr(unsafe.Pointer(elem)))
			elem.ksize = uint32(len(item.key))
			elem.pgid = item.pgid
		}
		// If the amount of memory allocated is smaller than the actual length of key+value, it needs to be reallocated
		klen, vlen := len(item.key), len(item.value
		if len(b) < klen+vlen {
			b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0[:]]))}// Write the actual key+value to the page
		copy(b[0:], item.key)
		b = b[klen:]
		copy(b[0:], item.value)
		b = b[vlen:]
	}
}
Copy the code
// split breaks a node up into multiple smaller nodes, each of which fits
// within the given page size (subject to the bucket's fill percent).
func (n *node) split(pageSize int) []*node {
	var nodes []*node

	node := n
	for {
		// Split the current node into two: a (fits a page) and b (remainder).
		a, b := node.splitTwo(pageSize)
		nodes = append(nodes, a)

		// Done once a split produces no remainder.
		if b == nil {
			break
		}

		// Keep splitting the remainder until every node fits in a page.
		node = b
	}

	return nodes
}
Copy the code
// splitTwo breaks a node into two smaller nodes based on the given page
// size. It returns (n, nil) when the node is too small to be split.
func (n *node) splitTwo(pageSize int) (*node, *node) {
	// Ignore the split if the node doesn't hold enough inodes for two pages
	// or if it already fits within a single page.
	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
		return n, nil
	}

	// Clamp the bucket's fill percent to the allowed range.
	var fillPercent = n.bucket.FillPercent
	if fillPercent < minFillPercent {
		fillPercent = minFillPercent
	} else if fillPercent > maxFillPercent {
		fillPercent = maxFillPercent
	}
	// The split threshold is the page size scaled by the fill percent.
	threshold := int(float64(pageSize) * fillPercent)

	// Determine the index at which to split the inodes array.
	splitIndex, _ := n.splitIndex(threshold)

	// If there is no parent yet, create one so both halves can hang off it.
	if n.parent == nil {
		n.parent = &node{bucket: n.bucket, children: []*node{n}}
	}

	// Create a new sibling node and register it with the parent. The new
	// node has no page allocated yet, so its pgid stays 0.
	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
	n.parent.children = append(n.parent.children, next)

	// Split the inodes between the two nodes at splitIndex.
	next.inodes = n.inodes[splitIndex:]
	n.inodes = n.inodes[:splitIndex]

	// Update statistics.
	n.bucket.tx.stats.Split++

	return n, next
}
Copy the code
//splitIndex Calculates the index of the inodes array that should be split based on the size of the threshold passed in
func (n *node) splitIndex(threshold int) (index, sz int) {
	sz = pageHeaderSize / / the size of the pageHeader
	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
		index = i
		inode := n.inodes[i]
		// The actual Size of each inode contains three parts: elementheader. Size+key.Size+value
		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
		if i >= minKeysPerPage && sz+elsize > threshold {
			// The total size of the previous I inodes is greater than the threshold and split from the current index
			break
		}
		sz += elsize
	}

	return
}
Copy the code
//spill recursively writes all nodes (leaf nodes) contained in a node to a dirty page, splitting nodes in the process.
// An error is returned if memory for dirty pages cannot be allocated
//spill describes the splitting process of b+ tree during the transaction commit. Since b+ tree pages are represented in memory by Node, page splitting is mapped by Node
func (n *node) spill(a) error {
	var tx = n.bucket.tx
	if n.spilled { // Returns if the current node is being split; A page can be split only once. Multiple split operations cannot be performed in parallel
		return nil
	}
	// Sort the current leaf nodes by key
	sort.Sort(n.children)
	// Split the leaf nodes recursively
	for i := 0; i < len(n.children); i++ {
		iferr := n.children[i].spill(); err ! =nil {
			return err
		}
	}
	// The leaf node list is only used to track node splits
	n.children = nil
	// Split the current node into multiple nodes according to pageSize
	var nodes = n.split(tx.db.pageSize)
	for _, node := range nodes {
		if node.pgid > 0 {
			// If there is an old page, after split, the old page should be released and returned to the freelist
			// The node has not been serialized to the corresponding page, so the corresponding page.id=0
			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
			node.pgid = 0
		}
		// Create a new page to store the current node, and return the page if the allocation fails
		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
		iferr ! =nil {
			return err
		}
		// if the assigned pageId is greater than the hidden pageId, then todo?? Tx.mit pageID change)
		if p.id >= tx.meta.pgid {
			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
		}
		// Deserialize Node to the newly shard page
		node.pgid = p.id
		node.write(p)
		node.spilled = true
		// If the current node's parent node is not empty, write the current node's key(or the first inode's key) to the parent node.
		ifnode.parent ! =nil {
			var key = node.key
			if key == nil {
				key = node.inodes[0].key
			}
			// Update if present, insert otherwise
			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
			node.key = node.inodes[0].key
		}
		tx.stats.Spill++
	}
	// If the root node is split, create a root node and split it again
	ifn.parent ! =nil && n.parent.pgid == 0 {
		// Clear the current leaf node information to ensure that it will not be split again
		n.children = nil
		return n.parent.spill()
	}
	return nil
}
Copy the code
Rebalance combine siblings into one node if their size is smaller than the fill threshold or the number of inodes is smaller than the minimum
func (n *node) rebalance(a) {
	if! n.unbalanced {rebalance
		return
	}
	n.unbalanced = false rebalance
	// Update statistics
	n.bucket.tx.stats.Rebalance++
	// Calculate the fill threshold, which is one-fourth of the page size
	var threshold = n.bucket.tx.db.pageSize / 4
	// A page should be at least a quarter of a page in size and have at least 2 (leaf nodes) or 1 (non-leaf nodes) keys
	if n.size() > threshold && len(n.inodes) > n.minKeys() {
		return
	}
	// The current node is the root node, special treatment
	if n.parent == nil {
		// The current node is a branch node and there is only one inode to split the current node
		if! n.isLeaf &&len(n.inodes) == 1 {
			// Move the leaf node of the root node up
			// Create a new child node with the current node as the root node
			child := n.bucket.node(n.inodes[0].pgid, n)
			n.isLeaf = child.isLeaf
			n.inodes = child.inodes[:]
			n.children = child.children
			// Recalculate the parent //todo of the current leaf node
			for _, inode := range n.inodes {
				// The current node is cached in the bucket
				if child, ok := n.bucket.nodes[inode.pgid]; ok {
					child.parent = n
				}
			}
			// Delete the old leaf node
			child.parent = nil
			delete(n.bucket.nodes, child.pgid)
			child.free()
		}

		return
	}
	// If the current node does not store the key, remove the current node
	if n.numChildren() == 0 {
		// If the current node has no leaf node, parallel size
		// Remove the current node
		n.parent.del(n.key) // Remove the current node from parent-inodes
		n.parent.removeChild(n) // Remove the current array from parent. Children
		delete(n.bucket.nodes, n.pgid)
		n.free() // Release the page corresponding to the current node
		n.parent.rebalance() Rebalance the parent node recursively
		return
	}
	// The current node has data
	var target *node // Locate the node where the rebalance is needed
	var useNextSibling = n.parent.childIndex(n) == 0 // Find the position of the current node in the parent node, is the leftmost node
	if useNextSibling {// The current node is the leftmost node
		// The sibling node on the right
		target = n.nextSibling() // Find the sibling to the right of the current node
	} else {
		// Left sibling node
		target = n.prevSibling()
	}
	// If both the current and target nodes are too small, merge them
	if useNextSibling {// If the target node is a sibling of the current node, merge the target node into the current node.
		// Iterate over the target element and recalculate its parent
		for _, inode := range target.inodes {
			// If the current bucket caches the page corresponding to the inode's pageID,
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				// Remove the child node from its parent
				child.parent.removeChild(child)
				child.parent = n // Recalculate its parent to the current node
				// Add child to the child of the current node
				child.parent.children = append(child.parent.children, child)  // ==> n.children = append(n.children, child) equality ?}}// Add the element of the target node to the element array of the current node
		n.inodes = append(n.inodes, target.inodes...)
		n.parent.del(target.key) // Remove the target key from the parent (target and n have the same parent)
		n.parent.removeChild(target) // Remove the target node from the parent leaf node of the target node
		delete(n.bucket.nodes, target.pgid) // Remove the target node from the node cache of the current bucket
		target.free() // Release the page occupied by the target node
	} else {
		// If the target node is the left sibling of the current node, merge the current node to the left sibling
		for _, inode := range n.inodes {
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				child.parent.removeChild(child)
				child.parent = target
				child.parent.children = append(child.parent.children, child)
			}
		}
		// Remove the current node's parent and current bucket from the cache, and add the current node's element to the sibling on the left
		target.inodes = append(target.inodes, n.inodes...) // The inodes are sorted by key and added to the destination node
		n.parent.del(n.key)
		n.parent.removeChild(n)
		delete(n.bucket.nodes, n.pgid)
		n.free()
	}
    // Recursive merge parent node
	n.parent.rebalance()
}
Copy the code