node

Node is the in-memory, deserialized representation of a page. B+ tree operations — lookups, page splitting, page rebalancing, and so on — are performed on nodes rather than directly on pages.

type node struct {
	// The Bucket this node (page) belongs to; roughly equivalent to a table in a relational database.
	bucket     *Bucket
	// True if this node is a leaf node (stores K/V pairs rather than child page references).
	isLeaf     bool
	// Set when an element is deleted (see del); tells rebalance() this node may need merging.
	unbalanced bool
	// Set once this node has been written to dirty pages by spill(); prevents spilling twice.
	spilled    bool
	key        []byte // First (smallest) key of this page, i.e. node.inodes[0].key once populated.
	// Page id of the backing page (0 if the node has not been written to a page yet).
	pgid       pgid 
	// Parent node; nil for the root of the tree.
	parent     *node
	// In-memory child nodes; used to track which children need spilling/splitting.
	children   nodes
	// The elements of this node: K/V pairs for leaves, key/pgid pairs for branches.
	inodes     inodes
}
Copy the code

inode

An inode represents an internal element within a node: it may point to an element already written to a page (a K/V pair), or to a K/V pair that has not yet been written to a page.

type inode struct {
	flags uint32 BucketLeafFlag = bucketLeafFlag = bucketLeafFlag
	pgid  pgid   // The current corresponding page_id is 0 if the page has not been written yet
	key   []byte // The actual key
	value []byte // The actual value
}
Copy the code

BoltDB organizes its data as a B+ tree of pages, with all data stored in leaf nodes. Since a node is the deserialized in-memory form of a page, every B+ tree operation works by locating a page, deserializing it into a node, and then operating on that node. Node therefore supports the following operations:

// root returns the top-level node this node is attached to
// by walking parent pointers up the B+ tree.
func (n *node) root() *node {
	if n.parent == nil {
		return n
	}
	return n.parent.root()
}
Copy the code
// minKeys returns the minimum number of inodes this node should have.
// Leaf nodes need at least one key; branch nodes need at least two.
func (n *node) minKeys() int {
	if n.isLeaf {
		return 1
	}
	return 2
}
Copy the code
// size returns the size of the node after serialization to a page:
// page header + one element header per inode + all keys and values.
func (n *node) size() int {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
		sz += elsz + len(item.key) + len(item.value)
	}
	return sz
}
Copy the code
// childAt returns the child node at the given index. Keys within a page
// are kept sorted in ascending order, so index i is the i-th smallest child.
// Leaf nodes have no children, so calling this on a leaf is a programmer error.
func (n *node) childAt(index int) *node {
	if n.isLeaf {
		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
	}
	childID := n.inodes[index].pgid
	return n.bucket.node(childID, n)
}
Copy the code
// Returns the right and left siblings of the current node
func (n *node) nextSibling(a) *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index >= n.parent.numChildren()- 1 {
		return nil
	}
	return n.parent.childAt(index + 1)}func (n *node) prevSibling(a) *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index == 0 {
		return nil
	}
	return n.parent.childAt(index - 1)}Copy the code
// Put writes a key/value to a node. If olekey=newkey, replace value.
// Otherwise insert a new inode
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
	if pgid >= n.bucket.tx.meta.pgid {
		// Todo this judgment
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
	} 
	// Find where newkey is inserted
	index := sort.Search(len(n.inodes), func(i int) bool { returnbytes.Compare(n.inodes[i].key, oldKey) ! =- 1 })
	// Check if it is a replacement insert
	exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)
	if! exact {// If the key does not exist, insert a new node
		n.inodes = append(n.inodes, inode{})
		copy(n.inodes[index+1:], n.inodes[index:])
	}
	inode := &n.inodes[index]
	inode.flags = flags // Assign related elements
	inode.key = newKey 
	inode.value = value
	inode.pgid = pgid
}
Copy the code
func (n *node) del(key []byte) {
	// Find the first index not greater than key by key
	index := sort.Search(len(n.inodes), func(i int) bool { returnbytes.Compare(n.inodes[i].key, key) ! =- 1 })
	// Returns if the current key does not exist
	if index >= len(n.inodes) || ! bytes.Equal(n.inodes[index].key, key) {return
	}
	// Delete related ninodes if they exist
	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...).// The node rebalance is not being rebalanced.
	Rebalance is triggered when an element is deleted
	n.unbalanced = true
}
Copy the code
//read deserializes the contents of a page into an in-memory node
func (n *node) read(p *page){ n.pgid = p.id n.isLeaf = (p.flags & leafPageFlag) ! =0
	n.inodes = make(inodes, int(p.count)) // Allocate the corresponding inode space according to the number of page elements
	for i := 0; i < int(p.count); i++ {
		// Deserialize the data corresponding to pageElement into the inode
		inode := &n.inodes[i]
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			inode.flags = elem.flags
			inode.key = elem.key()
			inode.value = elem.value()
		} else {
			elem := p.branchPageElement(uint16(i))
			inode.pgid = elem.pgid
			inode.key = elem.key()
		}
	}
	// If the current node has at least one inode, node.key equals the key of the first inode
	if len(n.inodes) > 0 {
		n.key = n.inodes[0].key
	} else {
		n.key = nil}}Copy the code
//write Writes the node item to one or more pages
// If the current node cannot hold a page, multiple consecutive pages will be allocated to store the current node
func (n *node) write(p *page) {
	if n.isLeaf {
		// Flag the page based on whether it is currently a leaf node
		p.flags |= leafPageFlag
	} else {
		p.flags |= branchPageFlag
	}
	// A maximum of 65535 inodes
	if len(n.inodes) >= 0xFFFF {
		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)".len(n.inodes), p.id))
	}
	p.count = uint16(len(n.inodes))
	// If the current node contains no elements, return
	if p.count == 0 {
		return
	}
	// Find the write offset in page according to pageElementSize*lengthOfInode
	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
	for i, item := range n.inodes {
		/ / to traverse the node of the inode, writes it to the page, write pageHeader (leafElement/branchElement)
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0-))uintptr(unsafe.Pointer(elem)))
			elem.flags = item.flags
			elem.ksize = uint32(len(item.key))
			elem.vsize = uint32(len(item.value))
		} else {
			elem := p.branchPageElement(uint16(i))
			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0-))uintptr(unsafe.Pointer(elem)))
			elem.ksize = uint32(len(item.key))
			elem.pgid = item.pgid
		}
		// If the amount of memory allocated is smaller than the actual length of key+value, it needs to be reallocated
		klen, vlen := len(item.key), len(item.value
		if len(b) < klen+vlen {
			b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0[:]]))}// Write the actual key+value to the page
		copy(b[0:], item.key)
		b = b[klen:]
		copy(b[0:], item.value)
		b = b[vlen:]
	}
}
Copy the code
// split breaks a node up into multiple smaller nodes, each of which fits
// within the given page size (subject to the bucket's fill percent).
func (n *node) split(pageSize int) []*node {
	var nodes []*node

	node := n
	for {
		// Split the current node into two: a (fits a page) and b (remainder).
		a, b := node.splitTwo(pageSize)
		nodes = append(nodes, a)

		// Done once a split produces no remainder.
		if b == nil {
			break
		}

		// Keep splitting the remainder until every node fits in a page.
		node = b
	}

	return nodes
}
Copy the code
// splitTwo breaks a node into two smaller nodes based on the given page
// size. It returns (n, nil) when the node is too small to be split.
func (n *node) splitTwo(pageSize int) (*node, *node) {
	// Ignore the split if the node doesn't hold enough inodes for two pages
	// or if it already fits within a single page.
	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
		return n, nil
	}

	// Clamp the bucket's fill percent to the allowed range.
	var fillPercent = n.bucket.FillPercent
	if fillPercent < minFillPercent {
		fillPercent = minFillPercent
	} else if fillPercent > maxFillPercent {
		fillPercent = maxFillPercent
	}
	// The split threshold is the page size scaled by the fill percent.
	threshold := int(float64(pageSize) * fillPercent)

	// Determine the index at which to split the inodes array.
	splitIndex, _ := n.splitIndex(threshold)

	// If there is no parent yet, create one so both halves can hang off it.
	if n.parent == nil {
		n.parent = &node{bucket: n.bucket, children: []*node{n}}
	}

	// Create a new sibling node and register it with the parent. The new
	// node has no page allocated yet, so its pgid stays 0.
	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
	n.parent.children = append(n.parent.children, next)

	// Split the inodes between the two nodes at splitIndex.
	next.inodes = n.inodes[splitIndex:]
	n.inodes = n.inodes[:splitIndex]

	// Update statistics.
	n.bucket.tx.stats.Split++

	return n, next
}
Copy the code
//splitIndex Calculates the index of the inodes array that should be split based on the size of the threshold passed in
func (n *node) splitIndex(threshold int) (index, sz int) {
	sz = pageHeaderSize / / the size of the pageHeader
	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
		index = i
		inode := n.inodes[i]
		// The actual Size of each inode contains three parts: elementheader. Size+key.Size+value
		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
		if i >= minKeysPerPage && sz+elsize > threshold {
			// The total size of the previous I inodes is greater than the threshold and split from the current index
			break
		}
		sz += elsize
	}

	return
}
Copy the code
//spill recursively writes all nodes (leaf nodes) contained in a node to a dirty page, splitting nodes in the process.
// An error is returned if memory for dirty pages cannot be allocated
//spill describes the splitting process of b+ tree during the transaction commit. Since b+ tree pages are represented in memory by Node, page splitting is mapped by Node
func (n *node) spill(a) error {
	var tx = n.bucket.tx
	if n.spilled { // Returns if the current node is being split; A page can be split only once. Multiple split operations cannot be performed in parallel
		return nil
	}
	// Sort the current leaf nodes by key
	sort.Sort(n.children)
	// Split the leaf nodes recursively
	for i := 0; i < len(n.children); i++ {
		iferr := n.children[i].spill(); err ! =nil {
			return err
		}
	}
	// The leaf node list is only used to track node splits
	n.children = nil
	// Split the current node into multiple nodes according to pageSize
	var nodes = n.split(tx.db.pageSize)
	for _, node := range nodes {
		if node.pgid > 0 {
			// If there is an old page, after split, the old page should be released and returned to the freelist
			// The node has not been serialized to the corresponding page, so the corresponding page.id=0
			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
			node.pgid = 0
		}
		// Create a new page to store the current node, and return the page if the allocation fails
		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
		iferr ! =nil {
			return err
		}
		// if the assigned pageId is greater than the hidden pageId, then todo?? Tx.mit pageID change)
		if p.id >= tx.meta.pgid {
			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
		}
		// Deserialize Node to the newly shard page
		node.pgid = p.id
		node.write(p)
		node.spilled = true
		// If the current node's parent node is not empty, write the current node's key(or the first inode's key) to the parent node.
		ifnode.parent ! =nil {
			var key = node.key
			if key == nil {
				key = node.inodes[0].key
			}
			// Update if present, insert otherwise
			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
			node.key = node.inodes[0].key
		}
		tx.stats.Spill++
	}
	// If the root node is split, create a root node and split it again
	ifn.parent ! =nil && n.parent.pgid == 0 {
		// Clear the current leaf node information to ensure that it will not be split again
		n.children = nil
		return n.parent.spill()
	}
	return nil
}
Copy the code
Rebalance combine siblings into one node if their size is smaller than the fill threshold or the number of inodes is smaller than the minimum
func (n *node) rebalance(a) {
	if! n.unbalanced {rebalance
		return
	}
	n.unbalanced = false rebalance
	// Update statistics
	n.bucket.tx.stats.Rebalance++
	// Calculate the fill threshold, which is one-fourth of the page size
	var threshold = n.bucket.tx.db.pageSize / 4
	// A page should be at least a quarter of a page in size and have at least 2 (leaf nodes) or 1 (non-leaf nodes) keys
	if n.size() > threshold && len(n.inodes) > n.minKeys() {
		return
	}
	// The current node is the root node, special treatment
	if n.parent == nil {
		// The current node is a branch node and there is only one inode to split the current node
		if! n.isLeaf &&len(n.inodes) == 1 {
			// Move the leaf node of the root node up
			// Create a new child node with the current node as the root node
			child := n.bucket.node(n.inodes[0].pgid, n)
			n.isLeaf = child.isLeaf
			n.inodes = child.inodes[:]
			n.children = child.children
			// Recalculate the parent //todo of the current leaf node
			for _, inode := range n.inodes {
				// The current node is cached in the bucket
				if child, ok := n.bucket.nodes[inode.pgid]; ok {
					child.parent = n
				}
			}
			// Delete the old leaf node
			child.parent = nil
			delete(n.bucket.nodes, child.pgid)
			child.free()
		}

		return
	}
	// If the current node does not store the key, remove the current node
	if n.numChildren() == 0 {
		// If the current node has no leaf node, parallel size
		// Remove the current node
		n.parent.del(n.key) // Remove the current node from parent-inodes
		n.parent.removeChild(n) // Remove the current array from parent. Children
		delete(n.bucket.nodes, n.pgid)
		n.free() // Release the page corresponding to the current node
		n.parent.rebalance() Rebalance the parent node recursively
		return
	}
	// The current node has data
	var target *node // Locate the node where the rebalance is needed
	var useNextSibling = n.parent.childIndex(n) == 0 // Find the position of the current node in the parent node, is the leftmost node
	if useNextSibling {// The current node is the leftmost node
		// The sibling node on the right
		target = n.nextSibling() // Find the sibling to the right of the current node
	} else {
		// Left sibling node
		target = n.prevSibling()
	}
	// If both the current and target nodes are too small, merge them
	if useNextSibling {// If the target node is a sibling of the current node, merge the target node into the current node.
		// Iterate over the target element and recalculate its parent
		for _, inode := range target.inodes {
			// If the current bucket caches the page corresponding to the inode's pageID,
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				// Remove the child node from its parent
				child.parent.removeChild(child)
				child.parent = n // Recalculate its parent to the current node
				// Add child to the child of the current node
				child.parent.children = append(child.parent.children, child)  // ==> n.children = append(n.children, child) equality ?}}// Add the element of the target node to the element array of the current node
		n.inodes = append(n.inodes, target.inodes...)
		n.parent.del(target.key) // Remove the target key from the parent (target and n have the same parent)
		n.parent.removeChild(target) // Remove the target node from the parent leaf node of the target node
		delete(n.bucket.nodes, target.pgid) // Remove the target node from the node cache of the current bucket
		target.free() // Release the page occupied by the target node
	} else {
		// If the target node is the left sibling of the current node, merge the current node to the left sibling
		for _, inode := range n.inodes {
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				child.parent.removeChild(child)
				child.parent = target
				child.parent.children = append(child.parent.children, child)
			}
		}
		// Remove the current node's parent and current bucket from the cache, and add the current node's element to the sibling on the left
		target.inodes = append(target.inodes, n.inodes...) // The inodes are sorted by key and added to the destination node
		n.parent.del(n.key)
		n.parent.removeChild(n)
		delete(n.bucket.nodes, n.pgid)
		n.free()
	}
    // Recursive merge parent node
	n.parent.rebalance()
}
Copy the code