diff options
author | Péter Szilágyi <peterke@gmail.com> | 2018-06-21 17:28:05 +0800 |
---|---|---|
committer | Felix Lange <fjl@users.noreply.github.com> | 2018-06-21 17:28:05 +0800 |
commit | d926bf2c7e3182d694c15829a37a0ca7331cd03c (patch) | |
tree | c2d3ddd85941a231fb05de46c36703273d11814a /trie | |
parent | 8db8d074e2fff547e9d85169018e03f89b5975a1 (diff) | |
download | dexon-d926bf2c7e3182d694c15829a37a0ca7331cd03c.tar.gz dexon-d926bf2c7e3182d694c15829a37a0ca7331cd03c.tar.zst dexon-d926bf2c7e3182d694c15829a37a0ca7331cd03c.zip |
trie: cache collapsed tries node, not rlp blobs (#16876)
The current trie memory database/cache that we do pruning on stores
trie nodes as binary rlp encoded blobs, and also stores the node
relationships/references for GC purposes. However, most of the trie
nodes (everything apart from a value node) is in essence just a
collection of references.
This PR switches out the RLP encoded trie blobs with the
collapsed-but-not-serialized trie nodes. This permits most of the
references to be recovered from within the node data structure,
avoiding the need to track them a second time (expensive memory wise).
Diffstat (limited to 'trie')
-rw-r--r-- | trie/database.go | 274 | ||||
-rw-r--r-- | trie/hasher.go | 30 | ||||
-rw-r--r-- | trie/node.go | 15 | ||||
-rw-r--r-- | trie/trie.go | 8 |
4 files changed, 260 insertions, 67 deletions
diff --git a/trie/database.go b/trie/database.go index 468c139df..88c6e9cd6 100644 --- a/trie/database.go +++ b/trie/database.go @@ -17,6 +17,8 @@ package trie import ( + "fmt" + "io" "sync" "time" @@ -24,6 +26,7 @@ import ( "github.com/ethereum/go-ethereum/ethdb" "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/metrics" + "github.com/ethereum/go-ethereum/rlp" ) var ( @@ -82,25 +85,188 @@ type Database struct { lock sync.RWMutex } +// rawNode is a simple binary blob used to differentiate between collapsed trie +// nodes and already encoded RLP binary blobs (while at the same time store them +// in the same cache fields). +type rawNode []byte + +func (n rawNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") } +func (n rawNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } +func (n rawNode) fstring(ind string) string { panic("this should never end up in a live trie") } + +// rawFullNode represents only the useful data content of a full node, with the +// caches and flags stripped out to minimize its data storage. This type honors +// the same RLP encoding as the original parent. +type rawFullNode [17]node + +func (n rawFullNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") } +func (n rawFullNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } +func (n rawFullNode) fstring(ind string) string { panic("this should never end up in a live trie") } + +func (n rawFullNode) EncodeRLP(w io.Writer) error { + var nodes [17]node + + for i, child := range n { + if child != nil { + nodes[i] = child + } else { + nodes[i] = nilValueNode + } + } + return rlp.Encode(w, nodes) +} + +// rawShortNode represents only the useful data content of a short node, with the +// caches and flags stripped out to minimize its data storage. This type honors +// the same RLP encoding as the original parent. +type rawShortNode struct { + Key []byte + Val node +} + +func (n rawShortNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") } +func (n rawShortNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } +func (n rawShortNode) fstring(ind string) string { panic("this should never end up in a live trie") } + // cachedNode is all the information we know about a single cached node in the // memory database write layer. type cachedNode struct { - blob []byte // Cached data block of the trie node - parents int // Number of live nodes referencing this one - children map[common.Hash]int // Children referenced by this nodes + node node // Cached collapsed trie node, or raw rlp data + size uint16 // Byte size of the useful cached data + + parents uint16 // Number of live nodes referencing this one + children map[common.Hash]uint16 // External children referenced by this node flushPrev common.Hash // Previous node in the flush-list flushNext common.Hash // Next node in the flush-list } +// rlp returns the raw rlp encoded blob of the cached node, either directly from +// the cache, or by regenerating it from the collapsed node. +func (n *cachedNode) rlp() []byte { + if node, ok := n.node.(rawNode); ok { + return node + } + blob, err := rlp.EncodeToBytes(n.node) + if err != nil { + panic(err) + } + return blob +} + +// obj returns the decoded and expanded trie node, either directly from the cache, +// or by regenerating it from the rlp encoded blob. +func (n *cachedNode) obj(hash common.Hash, cachegen uint16) node { + if node, ok := n.node.(rawNode); ok { + return mustDecodeNode(hash[:], node, cachegen) + } + return expandNode(hash[:], n.node, cachegen) +} + +// childs returns all the tracked children of this node, both the implicit ones +// from inside the node as well as the explicit ones from outside the node. +func (n *cachedNode) childs() []common.Hash { + children := make([]common.Hash, 0, 16) + for child := range n.children { + children = append(children, child) + } + if _, ok := n.node.(rawNode); !ok { + gatherChildren(n.node, &children) + } + return children +} + +// gatherChildren traverses the node hierarchy of a collapsed storage node and +// retrieves all the hashnode children. +func gatherChildren(n node, children *[]common.Hash) { + switch n := n.(type) { + case *rawShortNode: + gatherChildren(n.Val, children) + + case rawFullNode: + for i := 0; i < 16; i++ { + gatherChildren(n[i], children) + } + case hashNode: + *children = append(*children, common.BytesToHash(n)) + + case valueNode, nil: + + default: + panic(fmt.Sprintf("unknown node type: %T", n)) + } +} + +// simplifyNode traverses the hierarchy of an expanded memory node and discards +// all the internal caches, returning a node that only contains the raw data. +func simplifyNode(n node) node { + switch n := n.(type) { + case *shortNode: + // Short nodes discard the flags and cascade + return &rawShortNode{Key: n.Key, Val: simplifyNode(n.Val)} + + case *fullNode: + // Full nodes discard the flags and cascade + node := rawFullNode(n.Children) + for i := 0; i < len(node); i++ { + if node[i] != nil { + node[i] = simplifyNode(node[i]) + } + } + return node + + case valueNode, hashNode, rawNode: + return n + + default: + panic(fmt.Sprintf("unknown node type: %T", n)) + } +} + +// expandNode traverses the node hierarchy of a collapsed storage node and converts +// all fields and keys into expanded memory form. +func expandNode(hash hashNode, n node, cachegen uint16) node { + switch n := n.(type) { + case *rawShortNode: + // Short nodes need key and child expansion + return &shortNode{ + Key: compactToHex(n.Key), + Val: expandNode(nil, n.Val, cachegen), + flags: nodeFlag{ + hash: hash, + gen: cachegen, + }, + } + + case rawFullNode: + // Full nodes need child expansion + node := &fullNode{ + flags: nodeFlag{ + hash: hash, + gen: cachegen, + }, + } + for i := 0; i < len(node.Children); i++ { + if n[i] != nil { + node.Children[i] = expandNode(nil, n[i], cachegen) + } + } + return node + + case valueNode, hashNode: + return n + + default: + panic(fmt.Sprintf("unknown node type: %T", n)) + } +} + // NewDatabase creates a new trie database to store ephemeral trie content before // its written out to disk or garbage collected. func NewDatabase(diskdb ethdb.Database) *Database { return &Database{ - diskdb: diskdb, - nodes: map[common.Hash]*cachedNode{ - {}: {children: make(map[common.Hash]int)}, - }, + diskdb: diskdb, + nodes: map[common.Hash]*cachedNode{{}: {}}, preimages: make(map[common.Hash][]byte), } } @@ -110,33 +276,46 @@ func (db *Database) DiskDB() DatabaseReader { return db.diskdb } -// Insert writes a new trie node to the memory database if it's yet unknown. The -// method will make a copy of the slice. -func (db *Database) Insert(hash common.Hash, blob []byte) { +// InsertBlob writes a new reference tracked blob to the memory database if it's +// yet unknown. This method should only be used for non-trie nodes that require +// reference counting, since trie nodes are garbage collected directly through +// their embedded children. +func (db *Database) InsertBlob(hash common.Hash, blob []byte) { db.lock.Lock() defer db.lock.Unlock() - db.insert(hash, blob) + db.insert(hash, blob, rawNode(blob)) } -// insert is the private locked version of Insert. -func (db *Database) insert(hash common.Hash, blob []byte) { +// insert inserts a collapsed trie node into the memory database. This method is +// a more generic version of InsertBlob, supporting both raw blob insertions as +// well ex trie node insertions. The blob must always be specified to allow proper +// size tracking. +func (db *Database) insert(hash common.Hash, blob []byte, node node) { // If the node's already cached, skip if _, ok := db.nodes[hash]; ok { return } - db.nodes[hash] = &cachedNode{ - blob: common.CopyBytes(blob), - children: make(map[common.Hash]int), + // Create the cached entry for this node + entry := &cachedNode{ + node: simplifyNode(node), + size: uint16(len(blob)), flushPrev: db.newest, } + for _, child := range entry.childs() { + if c := db.nodes[child]; c != nil { + c.parents++ + } + } + db.nodes[hash] = entry + // Update the flush-list endpoints if db.oldest == (common.Hash{}) { db.oldest, db.newest = hash, hash } else { db.nodes[db.newest].flushNext, db.newest = hash, hash } - db.nodesSize += common.StorageSize(common.HashLength + len(blob)) + db.nodesSize += common.StorageSize(common.HashLength + entry.size) } // insertPreimage writes a new trie node pre-image to the memory database if it's @@ -151,8 +330,27 @@ func (db *Database) insertPreimage(hash common.Hash, preimage []byte) { db.preimagesSize += common.StorageSize(common.HashLength + len(preimage)) } -// Node retrieves a cached trie node from memory. If it cannot be found cached, -// the method queries the persistent database for the content. +// node retrieves a cached trie node from memory, or returns nil if none can be +// found in the memory cache. +func (db *Database) node(hash common.Hash, cachegen uint16) node { + // Retrieve the node from cache if available + db.lock.RLock() + node := db.nodes[hash] + db.lock.RUnlock() + + if node != nil { + return node.obj(hash, cachegen) + } + // Content unavailable in memory, attempt to retrieve from disk + enc, err := db.diskdb.Get(hash[:]) + if err != nil || enc == nil { + return nil + } + return mustDecodeNode(hash[:], enc, cachegen) +} + +// Node retrieves an encoded cached trie node from memory. If it cannot be found +// cached, the method queries the persistent database for the content. func (db *Database) Node(hash common.Hash) ([]byte, error) { // Retrieve the node from cache if available db.lock.RLock() @@ -160,7 +358,7 @@ func (db *Database) Node(hash common.Hash) ([]byte, error) { db.lock.RUnlock() if node != nil { - return node.blob, nil + return node.rlp(), nil } // Content unavailable in memory, attempt to retrieve from disk return db.diskdb.Get(hash[:]) @@ -222,20 +420,22 @@ func (db *Database) reference(child common.Hash, parent common.Hash) { return } // If the reference already exists, only duplicate for roots - if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) { + if db.nodes[parent].children == nil { + db.nodes[parent].children = make(map[common.Hash]uint16) + } else if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) { return } node.parents++ db.nodes[parent].children[child]++ } -// Dereference removes an existing reference from a parent node to a child node. -func (db *Database) Dereference(child common.Hash, parent common.Hash) { +// Dereference removes an existing reference from a root node. +func (db *Database) Dereference(root common.Hash) { db.lock.Lock() defer db.lock.Unlock() nodes, storage, start := len(db.nodes), db.nodesSize, time.Now() - db.dereference(child, parent) + db.dereference(root, common.Hash{}) db.gcnodes += uint64(nodes - len(db.nodes)) db.gcsize += storage - db.nodesSize @@ -254,9 +454,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) { // Dereference the parent-child node := db.nodes[parent] - node.children[child]-- - if node.children[child] == 0 { - delete(node.children, child) + if node.children != nil && node.children[child] > 0 { + node.children[child]-- + if node.children[child] == 0 { + delete(node.children, child) + } } // If the child does not exist, it's a previously committed node. node, ok := db.nodes[child] @@ -274,11 +476,11 @@ func (db *Database) dereference(child common.Hash, parent common.Hash) { db.nodes[node.flushNext].flushPrev = node.flushPrev } // Dereference all children and delete the node - for hash := range node.children { + for _, hash := range node.childs() { db.dereference(hash, child) } delete(db.nodes, child) - db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob)) + db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) } } @@ -323,7 +525,7 @@ func (db *Database) Cap(limit common.StorageSize) error { for size > limit && oldest != (common.Hash{}) { // Fetch the oldest referenced node and push into the batch node := db.nodes[oldest] - if err := batch.Put(oldest[:], node.blob); err != nil { + if err := batch.Put(oldest[:], node.rlp()); err != nil { db.lock.RUnlock() return err } @@ -340,7 +542,7 @@ func (db *Database) Cap(limit common.StorageSize) error { // is the total size, including both the useful cached data (hash -> blob), as // well as the flushlist metadata (2*hash). When flushing items from the cache, // we need to reduce both. - size -= common.StorageSize(3*common.HashLength + len(node.blob)) + size -= common.StorageSize(3*common.HashLength + int(node.size)) oldest = node.flushNext } // Flush out any remainder data from the last batch @@ -364,7 +566,7 @@ func (db *Database) Cap(limit common.StorageSize) error { delete(db.nodes, db.oldest) db.oldest = node.flushNext - db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob)) + db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) } if db.oldest != (common.Hash{}) { db.nodes[db.oldest].flushPrev = common.Hash{} @@ -460,12 +662,12 @@ func (db *Database) commit(hash common.Hash, batch ethdb.Batch) error { if !ok { return nil } - for child := range node.children { + for _, child := range node.childs() { if err := db.commit(child, batch); err != nil { return err } } - if err := batch.Put(hash[:], node.blob); err != nil { + if err := batch.Put(hash[:], node.rlp()); err != nil { return err } // If we've reached an optimal batch size, commit and start over @@ -496,11 +698,11 @@ func (db *Database) uncache(hash common.Hash) { db.nodes[node.flushNext].flushPrev = node.flushPrev } // Uncache the node's subtries and remove the node itself too - for child := range node.children { + for _, child := range node.childs() { db.uncache(child) } delete(db.nodes, hash) - db.nodesSize -= common.StorageSize(common.HashLength + len(node.blob)) + db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) } // Size returns the current storage size of the memory cache in front of the diff --git a/trie/hasher.go b/trie/hasher.go index 47c6dd8f9..7b1d7793f 100644 --- a/trie/hasher.go +++ b/trie/hasher.go @@ -137,9 +137,6 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) { return original, original, err } } - if collapsed.Val == nil { - collapsed.Val = valueNode(nil) // Ensure that nil children are encoded as empty strings. - } return collapsed, cached, nil case *fullNode: @@ -152,14 +149,9 @@ func (h *hasher) hashChildren(original node, db *Database) (node, node, error) { if err != nil { return original, original, err } - } else { - collapsed.Children[i] = valueNode(nil) // Ensure that nil children are encoded as empty strings. } } cached.Children[16] = n.Children[16] - if collapsed.Children[16] == nil { - collapsed.Children[16] = valueNode(nil) - } return collapsed, cached, nil default: @@ -192,34 +184,22 @@ func (h *hasher) store(n node, db *Database, force bool) (node, error) { if db != nil { // We are pooling the trie nodes into an intermediate memory cache - db.lock.Lock() hash := common.BytesToHash(hash) - db.insert(hash, h.tmp) - // Track all direct parent->child node references - switch n := n.(type) { - case *shortNode: - if child, ok := n.Val.(hashNode); ok { - db.reference(common.BytesToHash(child), hash) - } - case *fullNode: - for i := 0; i < 16; i++ { - if child, ok := n.Children[i].(hashNode); ok { - db.reference(common.BytesToHash(child), hash) - } - } - } + + db.lock.Lock() + db.insert(hash, h.tmp, n) db.lock.Unlock() // Track external references from account->storage trie if h.onleaf != nil { switch n := n.(type) { case *shortNode: - if child, ok := n.Val.(valueNode); ok && child != nil { + if child, ok := n.Val.(valueNode); ok { h.onleaf(child, hash) } case *fullNode: for i := 0; i < 16; i++ { - if child, ok := n.Children[i].(valueNode); ok && child != nil { + if child, ok := n.Children[i].(valueNode); ok { h.onleaf(child, hash) } } diff --git a/trie/node.go b/trie/node.go index 02815042c..a06f1b389 100644 --- a/trie/node.go +++ b/trie/node.go @@ -47,9 +47,22 @@ type ( valueNode []byte ) +// nilValueNode is used when collapsing internal trie nodes for hashing, since +// unset children need to serialize correctly. +var nilValueNode = valueNode(nil) + // EncodeRLP encodes a full node into the consensus RLP format. func (n *fullNode) EncodeRLP(w io.Writer) error { - return rlp.Encode(w, n.Children) + var nodes [17]node + + for i, child := range n.Children { + if child != nil { + nodes[i] = child + } else { + nodes[i] = nilValueNode + } + } + return rlp.Encode(w, nodes) } func (n *fullNode) copy() *fullNode { copy := *n; return © } diff --git a/trie/trie.go b/trie/trie.go index 30543c549..4284e30ad 100644 --- a/trie/trie.go +++ b/trie/trie.go @@ -433,12 +433,10 @@ func (t *Trie) resolveHash(n hashNode, prefix []byte) (node, error) { cacheMissCounter.Inc(1) hash := common.BytesToHash(n) - - enc, err := t.db.Node(hash) - if err != nil || enc == nil { - return nil, &MissingNodeError{NodeHash: hash, Path: prefix} + if node := t.db.node(hash, t.cachegen); node != nil { + return node, nil } - return mustDecodeNode(n, enc, t.cachegen), nil + return nil, &MissingNodeError{NodeHash: hash, Path: prefix} } // Root returns the root hash of the trie. |