/* $NetBSD: rbtdb.c,v 1.4.4.2 2024/02/29 12:34:33 martin Exp $ */ /* * Copyright (C) Internet Systems Consortium, Inc. ("ISC") * * SPDX-License-Identifier: MPL-2.0 * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, you can obtain one at https://mozilla.org/MPL/2.0/. * * See the COPYRIGHT file distributed with this work for additional * information regarding copyright ownership. */ /*! \file */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "rbtdb.h" #define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4') #define CHECK(op) \ do { \ result = (op); \ if (result != ISC_R_SUCCESS) \ goto failure; \ } while (0) /*% * Note that "impmagic" is not the first four bytes of the struct, so * ISC_MAGIC_VALID cannot be used. 
*/ #define VALID_RBTDB(rbtdb) \ ((rbtdb) != NULL && (rbtdb)->common.impmagic == RBTDB_MAGIC) typedef uint32_t rbtdb_serial_t; typedef uint32_t rbtdb_rdatatype_t; #define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF)) #define RBTDB_RDATATYPE_EXT(type) ((dns_rdatatype_t)((type) >> 16)) #define RBTDB_RDATATYPE_VALUE(base, ext) \ ((rbtdb_rdatatype_t)(((uint32_t)ext) << 16) | \ (((uint32_t)base) & 0xffff)) #define RBTDB_RDATATYPE_SIGNSEC \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec) #define RBTDB_RDATATYPE_SIGNSEC3 \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3) #define RBTDB_RDATATYPE_SIGNS \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns) #define RBTDB_RDATATYPE_SIGCNAME \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname) #define RBTDB_RDATATYPE_SIGDNAME \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname) #define RBTDB_RDATATYPE_SIGDS \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ds) #define RBTDB_RDATATYPE_SIGSOA \ RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_soa) #define RBTDB_RDATATYPE_NCACHEANY RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any) #define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) #define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l) #define RBTDB_LOCK(l, t) RWLOCK((l), (t)) #define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t)) /* * Since node locking is sensitive to both performance and memory footprint, * we need some trick here. If we have both high-performance rwlock and * high performance and small-memory reference counters, we use rwlock for * node lock and isc_refcount for node references. In this case, we don't have * to protect the access to the counters by locks. * Otherwise, we simply use ordinary mutex lock for node locking, and use * simple integers as reference counters which is protected by the lock. * In most cases, we can simply use wrapper macros such as NODE_LOCK and * NODE_UNLOCK. 
In some other cases, however, we need to protect reference * counters first and then protect other parts of a node as read-only data. * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also * provided for these special cases. When we can use the efficient backend * routines, we should only protect the "other members" by NODE_WEAKLOCK(read). * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical * section including the access to the reference counter. * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected * section is also protected by NODE_STRONGLOCK(). */ typedef isc_rwlock_t nodelock_t; #define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0) #define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l) #define NODE_LOCK(l, t) RWLOCK((l), (t)) #define NODE_UNLOCK(l, t) RWUNLOCK((l), (t)) #define NODE_TRYUPGRADE(l) isc_rwlock_tryupgrade(l) #define NODE_DOWNGRADE(l) isc_rwlock_downgrade(l) /*% * Whether to rate-limit updating the LRU to avoid possible thread contention. * Updating LRU requires write locking, so we don't do it every time the * record is touched - only after some time passes. */ #ifndef DNS_RBTDB_LIMITLRUUPDATE #define DNS_RBTDB_LIMITLRUUPDATE 1 #endif /*% Time after which we update LRU for glue records, 5 minutes */ #define DNS_RBTDB_LRUUPDATE_GLUE 300 /*% Time after which we update LRU for all other records, 10 minutes */ #define DNS_RBTDB_LRUUPDATE_REGULAR 600 /* * Allow clients with a virtual time of up to 5 minutes in the past to see * records that would have otherwise have expired. */ #define RBTDB_VIRTUAL 300 struct noqname { dns_name_t name; void *neg; void *negsig; dns_rdatatype_t type; }; typedef struct rdatasetheader { /*% * Locked by the owning node's lock. 
*/ rbtdb_serial_t serial; dns_ttl_t rdh_ttl; rbtdb_rdatatype_t type; atomic_uint_least16_t attributes; dns_trust_t trust; atomic_uint_fast32_t last_refresh_fail_ts; struct noqname *noqname; struct noqname *closest; unsigned int resign_lsb : 1; /*%< * We don't use the LIST macros, because the LIST structure has * both head and tail pointers, and is doubly linked. */ struct rdatasetheader *next; /*%< * If this is the top header for an rdataset, 'next' points * to the top header for the next rdataset (i.e., the next type). * Otherwise, it points up to the header whose down pointer points * at this header. */ struct rdatasetheader *down; /*%< * Points to the header for the next older version of * this rdataset. */ atomic_uint_fast32_t count; /*%< * Monotonously increased every time this rdataset is bound so that * it is used as the base of the starting point in DNS responses * when the "cyclic" rrset-order is required. */ dns_rbtnode_t *node; isc_stdtime_t last_used; ISC_LINK(struct rdatasetheader) link; unsigned int heap_index; /*%< * Used for TTL-based cache cleaning. */ isc_stdtime_t resign; /*%< * Case vector. If the bit is set then the corresponding * character in the owner name needs to be AND'd with 0x20, * rendering that character upper case. */ unsigned char upper[32]; } rdatasetheader_t; typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t; typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t; #define RDATASET_ATTR_NONEXISTENT 0x0001 /*%< May be potentially served as stale data. */ #define RDATASET_ATTR_STALE 0x0002 #define RDATASET_ATTR_IGNORE 0x0004 #define RDATASET_ATTR_RETAIN 0x0008 #define RDATASET_ATTR_NXDOMAIN 0x0010 #define RDATASET_ATTR_RESIGN 0x0020 #define RDATASET_ATTR_STATCOUNT 0x0040 #define RDATASET_ATTR_OPTOUT 0x0080 #define RDATASET_ATTR_NEGATIVE 0x0100 #define RDATASET_ATTR_PREFETCH 0x0200 #define RDATASET_ATTR_CASESET 0x0400 #define RDATASET_ATTR_ZEROTTL 0x0800 #define RDATASET_ATTR_CASEFULLYLOWER 0x1000 /*%< Ancient - awaiting cleanup. 
*/ #define RDATASET_ATTR_ANCIENT 0x2000 #define RDATASET_ATTR_STALE_WINDOW 0x4000 /* * XXX * When the cache will pre-expire data (due to memory low or other * situations) before the rdataset's TTL has expired, it MUST * respect the RETAIN bit and not expire the data until its TTL is * expired. */ #define EXISTS(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_NONEXISTENT) == 0) #define NONEXISTENT(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_NONEXISTENT) != 0) #define IGNORE(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_IGNORE) != 0) #define RETAIN(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_RETAIN) != 0) #define NXDOMAIN(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_NXDOMAIN) != 0) #define STALE(header) \ ((atomic_load_acquire(&(header)->attributes) & RDATASET_ATTR_STALE) != \ 0) #define STALE_WINDOW(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_STALE_WINDOW) != 0) #define RESIGN(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_RESIGN) != 0) #define OPTOUT(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_OPTOUT) != 0) #define NEGATIVE(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_NEGATIVE) != 0) #define PREFETCH(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_PREFETCH) != 0) #define CASESET(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_CASESET) != 0) #define ZEROTTL(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_ZEROTTL) != 0) #define CASEFULLYLOWER(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_CASEFULLYLOWER) != 0) #define ANCIENT(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_ANCIENT) != 0) #define STATCOUNT(header) \ ((atomic_load_acquire(&(header)->attributes) & \ RDATASET_ATTR_STATCOUNT) != 0) 
/* TTL to serve a stale (expired) answer for; never serve stale NXDOMAINs.
 * NOTE(review): the 'rbtdb' parameter is not parenthesized in the expansion;
 * callers must pass a simple lvalue. */
#define STALE_TTL(header, rbtdb) (NXDOMAIN(header) ? 0 : rbtdb->serve_stale_ttl)

/* Atomic accessors for rdatasetheader_t.attributes (acquire/release). */
#define RDATASET_ATTR_GET(header, attribute) \
	(atomic_load_acquire(&(header)->attributes) & attribute)
#define RDATASET_ATTR_SET(header, attribute) \
	atomic_fetch_or_release(&(header)->attributes, attribute)
#define RDATASET_ATTR_CLR(header, attribute) \
	atomic_fetch_and_release(&(header)->attributes, ~(attribute))

/* A header is still live at time 'now': TTL not yet passed, or exactly at
 * 'now' with the ZEROTTL attribute set. */
#define ACTIVE(header, now)                  \
	(((header)->rdh_ttl > (now)) ||      \
	 ((header)->rdh_ttl == (now) && ZEROTTL(header)))

#define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */

/* Glue-cache hash table sizing parameters. */
#define RBTDB_GLUE_TABLE_INIT_BITS  2U
#define RBTDB_GLUE_TABLE_MAX_BITS   32U
#define RBTDB_GLUE_TABLE_OVERCOMMIT 3

#define GOLDEN_RATIO_32 0x61C88647
#define HASHSIZE(bits)	(UINT64_C(1) << (bits))

/*%
 * Fibonacci-style hash: spread 'val' over a 'bits'-wide bucket index by
 * multiplying with the 32-bit golden ratio and keeping the top bits.
 * NOTE(review): a shift by (32 - bits) is undefined behavior when bits == 0;
 * the REQUIRE only bounds the upper end.  Callers appear to always pass
 * bits >= RBTDB_GLUE_TABLE_INIT_BITS (2) -- confirm.
 */
static uint32_t
hash_32(uint32_t val, unsigned int bits) {
	REQUIRE(bits <= RBTDB_GLUE_TABLE_MAX_BITS);
	/* High bits are more random. */
	return (val * GOLDEN_RATIO_32 >> (32 - bits));
}

/* Iterator option tests: may expired / stale rdatasets be returned? */
#define EXPIREDOK(rbtiterator) \
	(((rbtiterator)->common.options & DNS_DB_EXPIREDOK) != 0)

#define STALEOK(rbtiterator) \
	(((rbtiterator)->common.options & DNS_DB_STALEOK) != 0)

/*%
 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
 * There is a tradeoff issue about configuring this value: if this is too
 * small, it may cause heavier contention between threads; if this is too
 * large, LRU purge algorithm won't work well (entries tend to be purged
 * prematurely).  The default value should work well for most environments,
 * but this can also be configurable at compilation time via the
 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
 * 1 due to the assumption of overmem_purge().
*/ #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 #error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1" #else /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */ #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT #endif /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */ #else /* ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT */ #define DEFAULT_CACHE_NODE_LOCK_COUNT 17 #endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */ typedef struct { nodelock_t lock; /* Protected in the refcount routines. */ isc_refcount_t references; /* Locked by lock. */ bool exiting; } rbtdb_nodelock_t; typedef struct rbtdb_changed { dns_rbtnode_t *node; bool dirty; ISC_LINK(struct rbtdb_changed) link; } rbtdb_changed_t; typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t; typedef enum { dns_db_insecure, dns_db_partial, dns_db_secure } dns_db_secure_t; typedef struct dns_rbtdb dns_rbtdb_t; /* Reason for expiring a record from cache */ typedef enum { expire_lru, expire_ttl, expire_flush } expire_t; typedef struct rbtdb_glue rbtdb_glue_t; typedef struct rbtdb_glue_table_node { struct rbtdb_glue_table_node *next; dns_rbtnode_t *node; rbtdb_glue_t *glue_list; } rbtdb_glue_table_node_t; typedef enum { rdataset_ttl_fresh, rdataset_ttl_stale, rdataset_ttl_ancient } rdataset_ttl_t; typedef struct rbtdb_version { /* Not locked */ rbtdb_serial_t serial; dns_rbtdb_t *rbtdb; /* * Protected in the refcount routines. * XXXJT: should we change the lock policy based on the refcount * performance? */ isc_refcount_t references; /* Locked by database lock. */ bool writer; bool commit_ok; rbtdb_changedlist_t changed_list; rdatasetheaderlist_t resigned_list; ISC_LINK(struct rbtdb_version) link; dns_db_secure_t secure; bool havensec3; /* NSEC3 parameters */ dns_hash_t hash; uint8_t flags; uint16_t iterations; uint8_t salt_length; unsigned char salt[DNS_NSEC3_SALTSIZE]; /* * records and xfrsize are covered by rwlock. 
*/ isc_rwlock_t rwlock; uint64_t records; uint64_t xfrsize; isc_rwlock_t glue_rwlock; size_t glue_table_bits; size_t glue_table_nodecount; rbtdb_glue_table_node_t **glue_table; } rbtdb_version_t; typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t; struct dns_rbtdb { /* Unlocked. */ dns_db_t common; /* Locks the data in this struct */ isc_rwlock_t lock; /* Locks the tree structure (prevents nodes appearing/disappearing) */ isc_rwlock_t tree_lock; /* Locks for individual tree nodes */ unsigned int node_lock_count; rbtdb_nodelock_t *node_locks; dns_rbtnode_t *origin_node; dns_rbtnode_t *nsec3_origin_node; dns_stats_t *rrsetstats; /* cache DB only */ isc_stats_t *cachestats; /* cache DB only */ isc_stats_t *gluecachestats; /* zone DB only */ /* Locked by lock. */ unsigned int active; isc_refcount_t references; unsigned int attributes; rbtdb_serial_t current_serial; rbtdb_serial_t least_serial; rbtdb_serial_t next_serial; rbtdb_version_t *current_version; rbtdb_version_t *future_version; rbtdb_versionlist_t open_versions; isc_task_t *task; dns_dbnode_t *soanode; dns_dbnode_t *nsnode; /* * Maximum length of time to keep using a stale answer past its * normal TTL expiry. */ dns_ttl_t serve_stale_ttl; /* * The time after a failed lookup, where stale answers from cache * may be used directly in a DNS response without attempting a * new iterative lookup. */ uint32_t serve_stale_refresh; /* * This is a linked list used to implement the LRU cache. There will * be node_lock_count linked lists here. Nodes in bucket 1 will be * placed on the linked list rdatasets[1]. */ rdatasetheaderlist_t *rdatasets; /* * Start point % node_lock_count for next LRU cleanup. */ atomic_uint lru_sweep; /* * When performing LRU cleaning limit cleaning to headers that were * last used at or before this. */ atomic_uint last_used; /*% * Temporary storage for stale cache nodes and dynamically deleted * nodes that await being cleaned up. 
*/ rbtnodelist_t *deadnodes; /* List of nodes from which recursive tree pruning can be started from. * Locked by tree_lock. */ rbtnodelist_t prunenodes; /* * Heaps. These are used for TTL based expiry in a cache, * or for zone resigning in a zone DB. hmctx is the memory * context to use for the heap (which differs from the main * database memory context in the case of a cache). */ isc_mem_t *hmctx; isc_heap_t **heaps; /* Locked by tree_lock. */ dns_rbt_t *tree; dns_rbt_t *nsec; dns_rbt_t *nsec3; /* Unlocked */ unsigned int quantum; }; #define RBTDB_ATTR_LOADED 0x01 #define RBTDB_ATTR_LOADING 0x02 #define KEEPSTALE(rbtdb) ((rbtdb)->serve_stale_ttl > 0) /*% * Search Context */ typedef struct { dns_rbtdb_t *rbtdb; rbtdb_version_t *rbtversion; rbtdb_serial_t serial; unsigned int options; dns_rbtnodechain_t chain; bool copy_name; bool need_cleanup; bool wild; dns_rbtnode_t *zonecut; rdatasetheader_t *zonecut_rdataset; rdatasetheader_t *zonecut_sigrdataset; dns_fixedname_t zonecut_name; isc_stdtime_t now; } rbtdb_search_t; /*% * Load Context */ typedef struct { dns_rbtdb_t *rbtdb; isc_stdtime_t now; } rbtdb_load_t; static void delete_callback(void *data, void *arg); static void rdataset_disassociate(dns_rdataset_t *rdataset); static isc_result_t rdataset_first(dns_rdataset_t *rdataset); static isc_result_t rdataset_next(dns_rdataset_t *rdataset); static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata); static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target); static unsigned int rdataset_count(dns_rdataset_t *rdataset); static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name, dns_rdataset_t *neg, dns_rdataset_t *negsig); static isc_result_t rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name, dns_rdataset_t *neg, dns_rdataset_t *negsig); static bool need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now); static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t 
now); static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked, expire_t reason); static void overmem_purge(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked); static void resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader); static void resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version, rdatasetheader_t *header); static void prune_tree(isc_task_t *task, isc_event_t *event); static void rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust); static void rdataset_expire(dns_rdataset_t *rdataset); static void rdataset_clearprefetch(dns_rdataset_t *rdataset); static void rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name); static void rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name); static isc_result_t rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version, dns_message_t *msg); static void free_gluetable(rbtdb_version_t *version); static isc_result_t nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name); static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate, rdataset_first, rdataset_next, rdataset_current, rdataset_clone, rdataset_count, NULL, /* addnoqname */ rdataset_getnoqname, NULL, /* addclosest */ rdataset_getclosest, rdataset_settrust, rdataset_expire, rdataset_clearprefetch, rdataset_setownercase, rdataset_getownercase, rdataset_addglue }; static dns_rdatasetmethods_t slab_methods = { rdataset_disassociate, rdataset_first, rdataset_next, rdataset_current, rdataset_clone, rdataset_count, NULL, /* addnoqname */ NULL, /* getnoqname */ NULL, /* addclosest */ NULL, /* getclosest */ NULL, /* settrust */ NULL, /* expire */ NULL, /* clearprefetch */ NULL, /* setownercase */ NULL, /* getownercase */ NULL /* addglue */ }; static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp); static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator); static isc_result_t 
rdatasetiter_next(dns_rdatasetiter_t *iterator); static void rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset); static dns_rdatasetitermethods_t rdatasetiter_methods = { rdatasetiter_destroy, rdatasetiter_first, rdatasetiter_next, rdatasetiter_current }; typedef struct rbtdb_rdatasetiter { dns_rdatasetiter_t common; rdatasetheader_t *current; } rbtdb_rdatasetiter_t; /* * Note that these iterators, unless created with either DNS_DB_NSEC3ONLY or * DNS_DB_NONSEC3, will transparently move between the last node of the * "regular" RBT ("chain" field) and the root node of the NSEC3 RBT * ("nsec3chain" field) of the database in question, as if the latter was a * successor to the former in lexical order. The "current" field always holds * the address of either "chain" or "nsec3chain", depending on which RBT is * being traversed at given time. */ static void dbiterator_destroy(dns_dbiterator_t **iteratorp); static isc_result_t dbiterator_first(dns_dbiterator_t *iterator); static isc_result_t dbiterator_last(dns_dbiterator_t *iterator); static isc_result_t dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name); static isc_result_t dbiterator_prev(dns_dbiterator_t *iterator); static isc_result_t dbiterator_next(dns_dbiterator_t *iterator); static isc_result_t dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep, dns_name_t *name); static isc_result_t dbiterator_pause(dns_dbiterator_t *iterator); static isc_result_t dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name); static dns_dbiteratormethods_t dbiterator_methods = { dbiterator_destroy, dbiterator_first, dbiterator_last, dbiterator_seek, dbiterator_prev, dbiterator_next, dbiterator_current, dbiterator_pause, dbiterator_origin }; #define DELETION_BATCH_MAX 64 /* * If 'paused' is true, then the tree lock is not being held. 
*/ typedef struct rbtdb_dbiterator { dns_dbiterator_t common; bool paused; bool new_origin; isc_rwlocktype_t tree_locked; isc_result_t result; dns_fixedname_t name; dns_fixedname_t origin; dns_rbtnodechain_t chain; dns_rbtnodechain_t nsec3chain; dns_rbtnodechain_t *current; dns_rbtnode_t *node; dns_rbtnode_t *deletions[DELETION_BATCH_MAX]; int delcnt; bool nsec3only; bool nonsec3; } rbtdb_dbiterator_t; #define IS_STUB(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_STUB) != 0) #define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0) static void free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event); static void overmem(dns_db_t *db, bool over); static void setnsec3parameters(dns_db_t *db, rbtdb_version_t *version); static void setownercase(rdatasetheader_t *header, const dns_name_t *name); /*% * 'init_count' is used to initialize 'newheader->count' which inturn * is used to determine where in the cycle rrset-order cyclic starts. * We don't lock this as we don't care about simultaneous updates. * * Note: * Both init_count and header->count can be UINT32_MAX. * The count on the returned rdataset however can't be as * that indicates that the database does not implement cyclic * processing. */ static atomic_uint_fast32_t init_count = 0; /* * Locking * * If a routine is going to lock more than one lock in this module, then * the locking must be done in the following order: * * Tree Lock * * Node Lock (Only one from the set may be locked at one time by * any caller) * * Database Lock * * Failure to follow this hierarchy can result in deadlock. */ /* * Deleting Nodes * * For zone databases the node for the origin of the zone MUST NOT be deleted. 
*/ /* Fixed RRSet helper macros */ #define DNS_RDATASET_LENGTH 2; #if DNS_RDATASET_FIXED #define DNS_RDATASET_ORDER 2 #define DNS_RDATASET_COUNT (count * 4) #else /* !DNS_RDATASET_FIXED */ #define DNS_RDATASET_ORDER 0 #define DNS_RDATASET_COUNT 0 #endif /* DNS_RDATASET_FIXED */ /* * DB Routines */ static void attach(dns_db_t *source, dns_db_t **targetp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source; REQUIRE(VALID_RBTDB(rbtdb)); isc_refcount_increment(&rbtdb->references); *targetp = source; } static void free_rbtdb_callback(isc_task_t *task, isc_event_t *event) { dns_rbtdb_t *rbtdb = event->ev_arg; UNUSED(task); free_rbtdb(rbtdb, true, event); } static void update_cachestats(dns_rbtdb_t *rbtdb, isc_result_t result) { INSIST(IS_CACHE(rbtdb)); if (rbtdb->cachestats == NULL) { return; } switch (result) { case DNS_R_COVERINGNSEC: isc_stats_increment(rbtdb->cachestats, dns_cachestatscounter_coveringnsec); FALLTHROUGH; case ISC_R_SUCCESS: case DNS_R_CNAME: case DNS_R_DNAME: case DNS_R_DELEGATION: case DNS_R_NCACHENXDOMAIN: case DNS_R_NCACHENXRRSET: isc_stats_increment(rbtdb->cachestats, dns_cachestatscounter_hits); break; default: isc_stats_increment(rbtdb->cachestats, dns_cachestatscounter_misses); } } static bool do_stats(rdatasetheader_t *header) { return (EXISTS(header) && STATCOUNT(header)); } static void update_rrsetstats(dns_rbtdb_t *rbtdb, const rbtdb_rdatatype_t htype, const uint_least16_t hattributes, const bool increment) { dns_rdatastatstype_t statattributes = 0; dns_rdatastatstype_t base = 0; dns_rdatastatstype_t type; rdatasetheader_t *header = &(rdatasetheader_t){ .type = htype, .attributes = hattributes, }; if (!do_stats(header)) { return; } /* At the moment we count statistics only for cache DB */ INSIST(IS_CACHE(rbtdb)); if (NEGATIVE(header)) { if (NXDOMAIN(header)) { statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN; } else { statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET; base = RBTDB_RDATATYPE_EXT(header->type); } } else { base = 
RBTDB_RDATATYPE_BASE(header->type); } if (STALE(header)) { statattributes |= DNS_RDATASTATSTYPE_ATTR_STALE; } if (ANCIENT(header)) { statattributes |= DNS_RDATASTATSTYPE_ATTR_ANCIENT; } type = DNS_RDATASTATSTYPE_VALUE(base, statattributes); if (increment) { dns_rdatasetstats_increment(rbtdb->rrsetstats, type); } else { dns_rdatasetstats_decrement(rbtdb->rrsetstats, type); } } static void set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) { int idx; isc_heap_t *heap; dns_ttl_t oldttl; if (!IS_CACHE(rbtdb)) { header->rdh_ttl = newttl; return; } oldttl = header->rdh_ttl; header->rdh_ttl = newttl; /* * It's possible the rbtdb is not a cache. If this is the case, * we will not have a heap, and we move on. If we do, though, * we might need to adjust things. */ if (header->heap_index == 0 || newttl == oldttl) { return; } idx = header->node->locknum; if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL) { return; } heap = rbtdb->heaps[idx]; if (newttl < oldttl) { isc_heap_increased(heap, header->heap_index); } else { isc_heap_decreased(heap, header->heap_index); } } /*% * These functions allow the heap code to rank the priority of each * element. It returns true if v1 happens "sooner" than v2. */ static bool ttl_sooner(void *v1, void *v2) { rdatasetheader_t *h1 = v1; rdatasetheader_t *h2 = v2; return (h1->rdh_ttl < h2->rdh_ttl); } /*% * Return which RRset should be resigned sooner. If the RRsets have the * same signing time, prefer the other RRset over the SOA RRset. */ static bool resign_sooner(void *v1, void *v2) { rdatasetheader_t *h1 = v1; rdatasetheader_t *h2 = v2; return (h1->resign < h2->resign || (h1->resign == h2->resign && h1->resign_lsb < h2->resign_lsb) || (h1->resign == h2->resign && h1->resign_lsb == h2->resign_lsb && h2->type == RBTDB_RDATATYPE_SIGSOA)); } /*% * This function sets the heap index into the header. 
*/
/*%
 * Heap index callback: record the element's current position in the heap.
 */
static void
set_index(void *what, unsigned int idx) {
	rdatasetheader_t *h = what;

	h->heap_index = idx;
}

/*%
 * Work out how many nodes can be deleted in the time between two
 * requests to the nameserver.  Smooth the resulting number and use it
 * as a estimate for the number of nodes to be deleted in the next
 * iteration.
 */
static unsigned int
adjust_quantum(unsigned int old, isc_time_t *start) {
	unsigned int pps = dns_pps; /* packets per second */
	unsigned int interval;
	uint64_t usecs;
	isc_time_t end;
	unsigned int nodes;

	/* Enforce a floor so the interval below stays sane. */
	if (pps < 100) {
		pps = 100;
	}
	isc_time_now(&end);

	interval = 1000000 / pps; /* interval in usec */
	if (interval == 0) {
		interval = 1;
	}
	usecs = isc_time_microdiff(&end, start);
	if (usecs == 0) {
		/*
		 * We were unable to measure the amount of time taken.
		 * Double the nodes deleted next time.
		 */
		old *= 2;
		if (old > 1000) {
			old = 1000;
		}
		return (old);
	}
	/* Scale the previous quantum by (interval / elapsed), clamped. */
	nodes = old * interval;
	nodes /= (unsigned int)usecs;
	if (nodes == 0) {
		nodes = 1;
	} else if (nodes > 1000) {
		nodes = 1000;
	}

	/* Smooth */
	nodes = (nodes + old * 3) / 4;

	if (nodes != old) {
		/*
		 * NOTE(review): "%d" with unsigned arguments; harmless in
		 * practice since both values are capped at 1000 -- confirm.
		 */
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
			      "adjust_quantum: old=%d, new=%d", old, nodes);
	}

	return (nodes);
}

/*%
 * Free all memory used by the database, destroying the three RBTs
 * incrementally ('quantum' nodes at a time) when a task is available.
 * If destruction is interrupted by quota, a FREESTORAGE event is queued
 * and this function is re-entered later via free_rbtdb_callback() with
 * that event.
 */
static void
free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
	unsigned int i;
	isc_result_t result;
	char buf[DNS_NAME_FORMATSIZE];
	dns_rbtnode_t *node = NULL;
	dns_rbt_t **treep;
	isc_time_t start;

	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
		/*
		 * NOTE(review): (bool)-1 converts to true -- presumably
		 * forces the overmem state before teardown; confirm intent.
		 */
		overmem((dns_db_t *)rbtdb, (bool)-1);
	}

	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
	REQUIRE(rbtdb->future_version == NULL);

	if (rbtdb->current_version != NULL) {
		isc_refcount_decrementz(&rbtdb->current_version->references);

		/* Tear down the current version's locks and links. */
		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
		isc_rwlock_destroy(&rbtdb->current_version->glue_rwlock);
		isc_refcount_destroy(&rbtdb->current_version->references);
		isc_rwlock_destroy(&rbtdb->current_version->rwlock);
isc_mem_put(rbtdb->common.mctx, rbtdb->current_version, sizeof(rbtdb_version_t)); } /* * We assume the number of remaining dead nodes is reasonably small; * the overhead of unlinking all nodes here should be negligible. */ for (i = 0; i < rbtdb->node_lock_count; i++) { node = ISC_LIST_HEAD(rbtdb->deadnodes[i]); while (node != NULL) { ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink); node = ISC_LIST_HEAD(rbtdb->deadnodes[i]); } } node = ISC_LIST_HEAD(rbtdb->prunenodes); while (node != NULL) { ISC_LIST_UNLINK(rbtdb->prunenodes, node, prunelink); node = ISC_LIST_HEAD(rbtdb->prunenodes); } if (event == NULL) { rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0; } for (;;) { /* * pick the next tree to (start to) destroy */ treep = &rbtdb->tree; if (*treep == NULL) { treep = &rbtdb->nsec; if (*treep == NULL) { treep = &rbtdb->nsec3; /* * we're finished after clear cutting */ if (*treep == NULL) { break; } } } isc_time_now(&start); result = dns_rbt_destroy2(treep, rbtdb->quantum); if (result == ISC_R_QUOTA) { INSIST(rbtdb->task != NULL); if (rbtdb->quantum != 0) { rbtdb->quantum = adjust_quantum(rbtdb->quantum, &start); } if (event == NULL) { event = isc_event_allocate( rbtdb->common.mctx, NULL, DNS_EVENT_FREESTORAGE, free_rbtdb_callback, rbtdb, sizeof(isc_event_t)); } isc_task_send(rbtdb->task, &event); return; } INSIST(result == ISC_R_SUCCESS && *treep == NULL); } if (event != NULL) { isc_event_free(&event); } if (log) { if (dns_name_dynamic(&rbtdb->common.origin)) { dns_name_format(&rbtdb->common.origin, buf, sizeof(buf)); } else { strlcpy(buf, "", sizeof(buf)); } isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1), "done free_rbtdb(%s)", buf); } if (dns_name_dynamic(&rbtdb->common.origin)) { dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx); } for (i = 0; i < rbtdb->node_lock_count; i++) { isc_refcount_destroy(&rbtdb->node_locks[i].references); NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock); } /* * Clean up LRU / re-signing 
order lists. */ if (rbtdb->rdatasets != NULL) { for (i = 0; i < rbtdb->node_lock_count; i++) { INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i])); } isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets, rbtdb->node_lock_count * sizeof(rdatasetheaderlist_t)); } /* * Clean up dead node buckets. */ if (rbtdb->deadnodes != NULL) { for (i = 0; i < rbtdb->node_lock_count; i++) { INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i])); } isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes, rbtdb->node_lock_count * sizeof(rbtnodelist_t)); } /* * Clean up heap objects. */ if (rbtdb->heaps != NULL) { for (i = 0; i < rbtdb->node_lock_count; i++) { isc_heap_destroy(&rbtdb->heaps[i]); } isc_mem_put(rbtdb->hmctx, rbtdb->heaps, rbtdb->node_lock_count * sizeof(isc_heap_t *)); } if (rbtdb->rrsetstats != NULL) { dns_stats_detach(&rbtdb->rrsetstats); } if (rbtdb->cachestats != NULL) { isc_stats_detach(&rbtdb->cachestats); } if (rbtdb->gluecachestats != NULL) { isc_stats_detach(&rbtdb->gluecachestats); } isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); isc_rwlock_destroy(&rbtdb->tree_lock); isc_refcount_destroy(&rbtdb->references); if (rbtdb->task != NULL) { isc_task_detach(&rbtdb->task); } RBTDB_DESTROYLOCK(&rbtdb->lock); rbtdb->common.magic = 0; rbtdb->common.impmagic = 0; isc_mem_detach(&rbtdb->hmctx); INSIST(ISC_LIST_EMPTY(rbtdb->common.update_listeners)); isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb)); } static void maybe_free_rbtdb(dns_rbtdb_t *rbtdb) { bool want_free = false; unsigned int i; unsigned int inactive = 0; /* XXX check for open versions here */ if (rbtdb->soanode != NULL) { dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode); } if (rbtdb->nsnode != NULL) { dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode); } /* * The current version's glue table needs to be freed early * so the nodes are dereferenced before we check the active * node count below. 
 */
	if (rbtdb->current_version != NULL) {
		free_gluetable(rbtdb->current_version);
	}

	/*
	 * Even though there are no external direct references, there still
	 * may be nodes in use.
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
		rbtdb->node_locks[i].exiting = true;
		if (isc_refcount_current(&rbtdb->node_locks[i].references) ==
		    0) {
			inactive++;
		}
		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
	}

	if (inactive != 0) {
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
		rbtdb->active -= inactive;
		/* Free only when the last active bucket has drained. */
		if (rbtdb->active == 0) {
			want_free = true;
		}
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
		if (want_free) {
			char buf[DNS_NAME_FORMATSIZE];
			if (dns_name_dynamic(&rbtdb->common.origin)) {
				dns_name_format(&rbtdb->common.origin, buf,
						sizeof(buf));
			} else {
				strlcpy(buf, "", sizeof(buf));
			}
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
				      "calling free_rbtdb(%s)", buf);
			free_rbtdb(rbtdb, true, NULL);
		}
	}
}

/*
 * Drop the caller's reference to the database; when the last reference
 * goes away, attempt to tear the database down.
 */
static void
detach(dns_db_t **dbp) {
	REQUIRE(dbp != NULL && VALID_RBTDB((dns_rbtdb_t *)(*dbp)));
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);

	*dbp = NULL;

	if (isc_refcount_decrement(&rbtdb->references) == 1) {
		maybe_free_rbtdb(rbtdb);
	}
}

/*
 * Return (in *versionp) a new reference to the current version of 'db'.
 */
static void
currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *version;

	REQUIRE(VALID_RBTDB(rbtdb));

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
	version = rbtdb->current_version;
	isc_refcount_increment(&version->references);
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);

	*versionp = (dns_dbversion_t *)version;
}

/*
 * Allocate and initialize a version object with serial number 'serial',
 * 'references' initial references, an empty glue hash table, and empty
 * changed/resigned lists.
 */
static rbtdb_version_t *
allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
		 unsigned int references, bool writer) {
	rbtdb_version_t *version;
	size_t size;

	version = isc_mem_get(mctx, sizeof(*version));
	version->serial = serial;

	isc_refcount_init(&version->references, references);
	isc_rwlock_init(&version->glue_rwlock, 0, 0);

	version->glue_table_bits = RBTDB_GLUE_TABLE_INIT_BITS;
	version->glue_table_nodecount = 0U;

	size = HASHSIZE(version->glue_table_bits) *
	       sizeof(version->glue_table[0]);
	version->glue_table = isc_mem_get(mctx, size);
	memset(version->glue_table, 0, size);

	version->writer = writer;
	version->commit_ok = false;

	ISC_LIST_INIT(version->changed_list);
	ISC_LIST_INIT(version->resigned_list);
	ISC_LINK_INIT(version, link);

	return (version);
}

/*
 * Create a new writable (future) version with serial rbtdb->next_serial,
 * inheriting the security status, NSEC3 parameters and record counts of
 * the current version.  Only one future version may exist at a time.
 */
static isc_result_t
newversion(dns_db_t *db, dns_dbversion_t **versionp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *version;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(versionp != NULL && *versionp == NULL);
	REQUIRE(rbtdb->future_version == NULL);

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
	RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */
	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
				   true);
	version->rbtdb = rbtdb;
	version->commit_ok = true;
	version->secure = rbtdb->current_version->secure;
	version->havensec3 = rbtdb->current_version->havensec3;
	if (version->havensec3) {
		version->flags = rbtdb->current_version->flags;
		version->iterations = rbtdb->current_version->iterations;
		version->hash = rbtdb->current_version->hash;
		version->salt_length = rbtdb->current_version->salt_length;
		memmove(version->salt, rbtdb->current_version->salt,
			version->salt_length);
	} else {
		version->flags = 0;
		version->iterations = 0;
		version->hash = 0;
		version->salt_length = 0;
		memset(version->salt, 0, sizeof(version->salt));
	}
	isc_rwlock_init(&version->rwlock, 0, 0);
	RWLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
	version->records = rbtdb->current_version->records;
	version->xfrsize = rbtdb->current_version->xfrsize;
	RWUNLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read);
	rbtdb->next_serial++;
	rbtdb->future_version = version;
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	*versionp = version;

	return (ISC_R_SUCCESS);
}

/*
 * Attach a new reference to version 'source', returning it in *targetp.
 */
static void
attachversion(dns_db_t *db, dns_dbversion_t *source,
	      dns_dbversion_t **targetp) {
	dns_rbtdb_t
	*rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *rbtversion = source;

	REQUIRE(VALID_RBTDB(rbtdb));
	INSIST(rbtversion != NULL && rbtversion->rbtdb == rbtdb);

	isc_refcount_increment(&rbtversion->references);

	*targetp = rbtversion;
}

/*
 * Record that 'node' was changed in writable version 'version' so the
 * change can later be committed or rolled back; takes a reference to
 * the node.  On allocation failure, returns NULL and clears the
 * version's commit_ok flag.
 */
static rbtdb_changed_t *
add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version, dns_rbtnode_t *node) {
	rbtdb_changed_t *changed;

	/*
	 * Caller must be holding the node lock if its reference must be
	 * protected by the lock.
	 */

	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);

	REQUIRE(version->writer);

	if (changed != NULL) {
		isc_refcount_increment(&node->references);
		changed->node = node;
		changed->dirty = false;
		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
	} else {
		version->commit_ok = false;
	}

	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	return (changed);
}

/*
 * Free a noqname proof structure (name plus neg/negsig slabs) and NULL
 * the caller's pointer.
 */
static void
free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
	if (dns_name_dynamic(&(*noqname)->name)) {
		dns_name_free(&(*noqname)->name, mctx);
	}
	if ((*noqname)->neg != NULL) {
		isc_mem_put(mctx, (*noqname)->neg,
			    dns_rdataslab_size((*noqname)->neg, 0));
	}
	if ((*noqname)->negsig != NULL) {
		isc_mem_put(mctx, (*noqname)->negsig,
			    dns_rdataslab_size((*noqname)->negsig, 0));
	}
	isc_mem_put(mctx, *noqname, sizeof(**noqname));
	*noqname = NULL;
}

/*
 * Initialize the bookkeeping fields of a freshly allocated rdataset
 * header (list link, heap index, atomic attributes/timestamp).
 */
static void
init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h) {
	ISC_LINK_INIT(h, link);
	h->heap_index = 0;
	atomic_init(&h->attributes, 0);
	atomic_init(&h->last_refresh_fail_ts, 0);

	STATIC_ASSERT((sizeof(h->attributes) == 2),
		      "The .attributes field of rdatasetheader_t needs to be "
		      "16-bit int type exactly.");

#if TRACE_HEADER
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
		fprintf(stderr, "initialized header: %p\n", h);
	}
#else /* if TRACE_HEADER */
	UNUSED(rbtdb);
#endif /* if TRACE_HEADER */
}

/*
 * Copy the case-preservation attributes and upper-case map from 'old'
 * to 'newh', if 'old' carries case information.
 */
static void
update_newheader(rdatasetheader_t *newh, rdatasetheader_t *old) {
	if (CASESET(old)) {
		uint_least16_t attr = RDATASET_ATTR_GET(
			old,
			(RDATASET_ATTR_CASESET | RDATASET_ATTR_CASEFULLYLOWER));
		RDATASET_ATTR_SET(newh, attr);
		memmove(newh->upper, old->upper, sizeof(old->upper));
	}
}

/*
 * Allocate a new rdataset header from 'mctx' and initialize it.
 */
static rdatasetheader_t *
new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx) {
	rdatasetheader_t *h;

	h = isc_mem_get(mctx, sizeof(*h));

#if TRACE_HEADER
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
		fprintf(stderr, "allocated header: %p\n", h);
	}
#endif /* if TRACE_HEADER */

	/* NOTE(review): 0xeb looks like a debug poison pattern — confirm. */
	memset(h->upper, 0xeb, sizeof(h->upper));
	init_rdataset(rbtdb, h);
	h->rdh_ttl = 0;
	return (h);
}

/*
 * Release all resources owned by 'rdataset': statistics accounting,
 * LRU list and TTL heap linkage, noqname/closest proofs, and finally
 * the memory itself (header only for NONEXISTENT placeholders,
 * header plus slab data otherwise).
 */
static void
free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset) {
	unsigned int size;
	int idx;

	update_rrsetstats(rbtdb, rdataset->type,
			  atomic_load_acquire(&rdataset->attributes), false);

	idx = rdataset->node->locknum;
	if (ISC_LINK_LINKED(rdataset, link)) {
		INSIST(IS_CACHE(rbtdb));
		ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, link);
	}

	if (rdataset->heap_index != 0) {
		isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index);
	}
	rdataset->heap_index = 0;

	if (rdataset->noqname != NULL) {
		free_noqname(mctx, &rdataset->noqname);
	}
	if (rdataset->closest != NULL) {
		free_noqname(mctx, &rdataset->closest);
	}

	if (NONEXISTENT(rdataset)) {
		size = sizeof(*rdataset);
	} else {
		size = dns_rdataslab_size((unsigned char *)rdataset,
					  sizeof(*rdataset));
	}

	isc_mem_put(mctx, rdataset, size);
}

static void
rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
	rdatasetheader_t *header, *dcurrent;
	bool make_dirty = false;

	/*
	 * Caller must hold the node lock.
	 */

	/*
	 * We set the IGNORE attribute on rdatasets with serial number
	 * 'serial'.  When the reference count goes to zero, these rdatasets
	 * will be cleaned up; until that time, they will be ignored.
	 */
	for (header = node->data; header != NULL; header = header->next) {
		if (header->serial == serial) {
			RDATASET_ATTR_SET(header, RDATASET_ATTR_IGNORE);
			make_dirty = true;
		}
		for (dcurrent = header->down; dcurrent != NULL;
		     dcurrent = dcurrent->down)
		{
			if (dcurrent->serial == serial) {
				RDATASET_ATTR_SET(dcurrent,
						  RDATASET_ATTR_IGNORE);
				make_dirty = true;
			}
		}
	}
	if (make_dirty) {
		node->dirty = 1;
	}
}

/*
 * Atomically set RDATASET_ATTR_ANCIENT on 'header' (no-op if already
 * set), move the rrset statistics from the old attribute bucket to the
 * ancient one, and mark the owning node dirty for later cleanup.
 */
static void
mark_header_ancient(dns_rbtdb_t *rbtdb, rdatasetheader_t *header) {
	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
	uint_least16_t newattributes = 0;

	/*
	 * If we are already ancient there is nothing to do.
	 */
	do {
		if ((attributes & RDATASET_ATTR_ANCIENT) != 0) {
			return;
		}
		newattributes = attributes | RDATASET_ATTR_ANCIENT;
	} while (!atomic_compare_exchange_weak_acq_rel(
		&header->attributes, &attributes, newattributes));

	/*
	 * Decrement the stats counter for the appropriate RRtype.
	 * If the STALE attribute is set, this will decrement the
	 * stale type counter, otherwise it decrements the active
	 * stats type counter.
	 */
	update_rrsetstats(rbtdb, header->type, attributes, false);
	header->node->dirty = 1;

	/* Increment the stats counter for the ancient RRtype. */
	update_rrsetstats(rbtdb, header->type, newattributes, true);
}

/*
 * Atomically set RDATASET_ATTR_STALE on 'header' (no-op if already
 * set) and move the rrset statistics to the stale bucket.  Must not be
 * called on a ZEROTTL header.
 */
static void
mark_header_stale(dns_rbtdb_t *rbtdb, rdatasetheader_t *header) {
	uint_least16_t attributes = atomic_load_acquire(&header->attributes);
	uint_least16_t newattributes = 0;

	INSIST((attributes & RDATASET_ATTR_ZEROTTL) == 0);

	/*
	 * If we are already stale there is nothing to do.
	 */
	do {
		if ((attributes & RDATASET_ATTR_STALE) != 0) {
			return;
		}
		newattributes = attributes | RDATASET_ATTR_STALE;
	} while (!atomic_compare_exchange_weak_acq_rel(
		&header->attributes, &attributes, newattributes));

	/*
	 * Decrement the stats counter for the appropriate RRtype.
	 * If the ANCIENT attribute is set (although it is very
	 * unlikely that an RRset goes from ANCIENT to STALE), this
	 * will decrement the ancient stale type counter, otherwise it
	 * decrements the active stats type counter.
	 */
	update_rrsetstats(rbtdb, header->type, attributes, false);
	update_rrsetstats(rbtdb, header->type, newattributes, true);
}

/*
 * Free every header in 'top's down-chain, leaving only 'top' itself.
 */
static void
clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx,
		    rdatasetheader_t *top) {
	rdatasetheader_t *d, *down_next;

	for (d = top->down; d != NULL; d = down_next) {
		down_next = d->down;
		free_rdataset(rbtdb, mctx, d);
	}
	top->down = NULL;
}

/*
 * Remove all removable headers from a cache node: each header's entire
 * down-chain, plus any top-level header that is nonexistent, ancient,
 * or (unless stale answers are being kept) stale.  Clears node->dirty.
 */
static void
clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
	rdatasetheader_t *current, *top_prev, *top_next;
	isc_mem_t *mctx = rbtdb->common.mctx;

	/*
	 * Caller must be holding the node lock.
	 */

	top_prev = NULL;
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;
		clean_stale_headers(rbtdb, mctx, current);
		/*
		 * If current is nonexistent, ancient, or stale and
		 * we are not keeping stale, we can clean it up.
		 */
		if (NONEXISTENT(current) || ANCIENT(current) ||
		    (STALE(current) && !KEEPSTALE(rbtdb)))
		{
			if (top_prev != NULL) {
				top_prev->next = current->next;
			} else {
				node->data = current->next;
			}
			free_rdataset(rbtdb, mctx, current);
		} else {
			top_prev = current;
		}
	}
	node->dirty = 0;
}

/*
 * Remove headers of a zone node that are no longer visible in any open
 * version (older than 'least_serial', or flagged IGNORE); clears
 * node->dirty when nothing removable remains.
 */
static void
clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		rbtdb_serial_t least_serial) {
	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
	rdatasetheader_t *top_prev, *top_next;
	isc_mem_t *mctx = rbtdb->common.mctx;
	bool still_dirty = false;

	/*
	 * Caller must be holding the node lock.
	 */
	REQUIRE(least_serial != 0);

	top_prev = NULL;
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;

		/*
		 * First, we clean up any instances of multiple rdatasets
		 * with the same serial number, or that have the IGNORE
		 * attribute.
		 */
		dparent = current;
		for (dcurrent = current->down; dcurrent != NULL;
		     dcurrent = down_next)
		{
			down_next = dcurrent->down;
			INSIST(dcurrent->serial <= dparent->serial);
			if (dcurrent->serial == dparent->serial ||
			    IGNORE(dcurrent))
			{
				if (down_next != NULL) {
					down_next->next = dparent;
				}
				dparent->down = down_next;
				free_rdataset(rbtdb, mctx, dcurrent);
			} else {
				dparent = dcurrent;
			}
		}

		/*
		 * We've now eliminated all IGNORE datasets with the possible
		 * exception of current, which we now check.
		 */
		if (IGNORE(current)) {
			down_next = current->down;
			if (down_next == NULL) {
				if (top_prev != NULL) {
					top_prev->next = current->next;
				} else {
					node->data = current->next;
				}
				free_rdataset(rbtdb, mctx, current);
				/*
				 * current no longer exists, so we can
				 * just continue with the loop.
				 */
				continue;
			} else {
				/*
				 * Pull up current->down, making it the new
				 * current.
				 */
				if (top_prev != NULL) {
					top_prev->next = down_next;
				} else {
					node->data = down_next;
				}
				down_next->next = top_next;
				free_rdataset(rbtdb, mctx, current);
				current = down_next;
			}
		}

		/*
		 * We now try to find the first down node less than the
		 * least serial.
		 */
		dparent = current;
		for (dcurrent = current->down; dcurrent != NULL;
		     dcurrent = down_next)
		{
			down_next = dcurrent->down;
			if (dcurrent->serial < least_serial) {
				break;
			}
			dparent = dcurrent;
		}

		/*
		 * If there is a such an rdataset, delete it and any older
		 * versions.
		 */
		if (dcurrent != NULL) {
			do {
				down_next = dcurrent->down;
				INSIST(dcurrent->serial <= least_serial);
				free_rdataset(rbtdb, mctx, dcurrent);
				dcurrent = down_next;
			} while (dcurrent != NULL);
			dparent->down = NULL;
		}

		/*
		 * Note.  The serial number of 'current' might be less than
		 * least_serial too, but we cannot delete it because it is
		 * the most recent version, unless it is a NONEXISTENT
		 * rdataset.
		 */
		if (current->down != NULL) {
			still_dirty = true;
			top_prev = current;
		} else {
			/*
			 * If this is a NONEXISTENT rdataset, we can delete it.
			 */
			if (NONEXISTENT(current)) {
				if (top_prev != NULL) {
					top_prev->next = current->next;
				} else {
					node->data = current->next;
				}
				free_rdataset(rbtdb, mctx, current);
			} else {
				top_prev = current;
			}
		}
	}
	if (!still_dirty) {
		node->dirty = 0;
	}
}

/*
 * Delete 'node' from whichever tree it lives in (main, NSEC, or NSEC3),
 * removing the matching auxiliary NSEC-tree node first when present.
 *
 * tree_lock(write) must be held.
 */
static void
delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
	dns_rbtnode_t *nsecnode;
	dns_fixedname_t fname;
	dns_name_t *name;
	isc_result_t result = ISC_R_UNEXPECTED;

	INSIST(!ISC_LINK_LINKED(node, deadlink));

	if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
		char printname[DNS_NAME_FORMATSIZE];
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
			      "delete_node(): %p %s (bucket %d)", node,
			      dns_rbt_formatnodename(node, printname,
						     sizeof(printname)),
			      node->locknum);
	}

	switch (node->nsec) {
	case DNS_RBT_NSEC_NORMAL:
		result = dns_rbt_deletenode(rbtdb->tree, node, false);
		break;
	case DNS_RBT_NSEC_HAS_NSEC:
		/*
		 * Though this may be wasteful, it has to be done before
		 * node is deleted.
		 */
		name = dns_fixedname_initname(&fname);
		dns_rbt_fullnamefromnode(node, name);
		/*
		 * Delete the corresponding node from the auxiliary NSEC
		 * tree before deleting from the main tree.
		 */
		nsecnode = NULL;
		result = dns_rbt_findnode(rbtdb->nsec, name, NULL, &nsecnode,
					  NULL, DNS_RBTFIND_EMPTYDATA, NULL,
					  NULL);
		if (result != ISC_R_SUCCESS) {
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
				      "delete_node: "
				      "dns_rbt_findnode(nsec): %s",
				      isc_result_totext(result));
		} else {
			result = dns_rbt_deletenode(rbtdb->nsec, nsecnode,
						    false);
			if (result != ISC_R_SUCCESS) {
				isc_log_write(
					dns_lctx, DNS_LOGCATEGORY_DATABASE,
					DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
					"delete_node(): "
					"dns_rbt_deletenode(nsecnode): %s",
					isc_result_totext(result));
			}
		}
		result = dns_rbt_deletenode(rbtdb->tree, node, false);
		break;
	case DNS_RBT_NSEC_NSEC:
		result = dns_rbt_deletenode(rbtdb->nsec, node, false);
		break;
	case DNS_RBT_NSEC_NSEC3:
		result = dns_rbt_deletenode(rbtdb->nsec3, node, false);
		break;
	}
	if (result != ISC_R_SUCCESS) {
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
			      "delete_node(): "
			      "dns_rbt_deletenode: %s",
			      isc_result_totext(result));
	}
}

/*
 * Take a reference to 'node', unlinking it from the dead-nodes list if
 * we hold the write lock; the first reference to a node also counts a
 * reference against the node's lock bucket.
 *
 * Caller must be holding the node lock.
 */
static void
new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
	      isc_rwlocktype_t locktype) {
	if (locktype == isc_rwlocktype_write &&
	    ISC_LINK_LINKED(node, deadlink))
	{
		ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
				deadlink);
	}
	if (isc_refcount_increment0(&node->references) == 0) {
		/* this is the first reference to the node */
		isc_refcount_increment0(
			&rbtdb->node_locks[node->locknum].references);
	}
}

/*%
 * The tree lock must be held for the result to be valid.
 */
static bool
is_leaf(dns_rbtnode_t *node) {
	return (node->parent != NULL && node->parent->down == node &&
		node->left == NULL && node->right == NULL);
}

/*%
 * Queue 'node' for batch pruning, dispatching a prune_tree() event if
 * one is not already pending (the event holds a database reference).
 *
 * The tree lock must be held when this function is called as it reads and
 * updates rbtdb->prunenodes.
 */
static void
send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		   isc_rwlocktype_t locktype) {
	bool pruning_queued = (ISC_LIST_HEAD(rbtdb->prunenodes) != NULL);

	INSIST(locktype == isc_rwlocktype_write);

	new_reference(rbtdb, node, locktype);
	INSIST(!ISC_LINK_LINKED(node, prunelink));
	ISC_LIST_APPEND(rbtdb->prunenodes, node, prunelink);

	if (!pruning_queued) {
		isc_event_t *ev = NULL;
		dns_db_t *db = NULL;

		attach((dns_db_t *)rbtdb, &db);

		ev = isc_event_allocate(rbtdb->common.mctx, NULL,
					DNS_EVENT_RBTPRUNE, prune_tree, db,
					sizeof(isc_event_t));
		isc_task_send(rbtdb->task, &ev);
	}
}

/*%
 * Clean up dead nodes.  These are nodes which have no references, and
 * have no data.  They are dead but we could not or chose not to delete
 * them when we deleted all the data at that node because we did not want
 * to wait for the tree write lock.
 *
 * The caller must hold a tree write lock and bucketnum'th node (write) lock.
 */
static void
cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
	dns_rbtnode_t *node;
	int count = 10; /* XXXJT: should be adjustable */

	node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
	while (node != NULL && count > 0) {
		ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);

		/*
		 * We might have reactivated this node without a tree write
		 * lock, so we couldn't remove this node from deadnodes then
		 * and we have to do it now.
		 */
		if (isc_refcount_current(&node->references) != 0 ||
		    node->data != NULL)
		{
			node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
			count--;
			continue;
		}

		if (is_leaf(node) && rbtdb->task != NULL) {
			send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
		} else if (node->down == NULL && node->data == NULL) {
			/*
			 * Not a interior node and not needing to be
			 * reactivated.
			 */
			delete_node(rbtdb, node);
		} else if (node->data == NULL) {
			/*
			 * A interior node without data.  Leave linked to
			 * to be cleaned up when node->down becomes NULL.
			 */
			ISC_LIST_APPEND(rbtdb->deadnodes[bucketnum], node,
					deadlink);
		}
		node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
		count--;
	}
}

/*
 * This function is assumed to be called when a node is newly referenced
 * and can be in the deadnode list.  In that case the node must be retrieved
 * from the list because it is going to be used.  In addition, if the caller
 * happens to hold a write lock on the tree, it's a good chance to purge dead
 * nodes.
 * Note: while a new reference is gained in multiple places, there are only very
 * few cases where the node can be in the deadnode list (only empty nodes can
 * have been added to the list).
 */
static void
reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		isc_rwlocktype_t treelocktype) {
	isc_rwlocktype_t locktype = isc_rwlocktype_read;
	nodelock_t *nodelock = &rbtdb->node_locks[node->locknum].lock;
	bool maybe_cleanup = false;

	POST(locktype);

	NODE_LOCK(nodelock, locktype);

	/*
	 * Check if we can possibly cleanup the dead node.  If so, upgrade
	 * the node lock below to perform the cleanup.
	 */
	if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
	    treelocktype == isc_rwlocktype_write)
	{
		maybe_cleanup = true;
	}

	if (ISC_LINK_LINKED(node, deadlink) || maybe_cleanup) {
		/*
		 * Upgrade the lock and test if we still need to unlink.
		 */
		NODE_UNLOCK(nodelock, locktype);
		locktype = isc_rwlocktype_write;
		POST(locktype);
		NODE_LOCK(nodelock, locktype);

		if (ISC_LINK_LINKED(node, deadlink)) {
			ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node,
					deadlink);
		}
		if (maybe_cleanup) {
			cleanup_dead_nodes(rbtdb, node->locknum);
		}
	}

	new_reference(rbtdb, node, locktype);

	NODE_UNLOCK(nodelock, locktype);
}

/*
 * Caller must be holding the node lock; either the "strong", read or write
 * lock.
 * Note that the lock must be held even when node references are
 * atomically modified; in that case the decrement operation itself does not
 * have to be protected, but we must avoid a race condition where multiple
 * threads are decreasing the reference to zero simultaneously and at least
 * one of them is going to free the node.
 *
 * This function returns true if and only if the node reference decreases
 * to zero.
 *
 * NOTE: Decrementing the reference count of a node to zero does not mean it
 * will be immediately freed.
 */
static bool
decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		    rbtdb_serial_t least_serial, isc_rwlocktype_t nlock,
		    isc_rwlocktype_t tlock, bool pruning) {
	isc_result_t result;
	bool write_locked;
	bool locked = tlock != isc_rwlocktype_none;
	rbtdb_nodelock_t *nodelock;
	int bucket = node->locknum;
	bool no_reference = true;
	uint_fast32_t refs;

	nodelock = &rbtdb->node_locks[bucket];

#define KEEP_NODE(n, r, l)                                  \
	((n)->data != NULL || ((l) && (n)->down != NULL) || \
	 (n) == (r)->origin_node || (n) == (r)->nsec3_origin_node)

	/* Handle easy and typical case first. */
	if (!node->dirty && KEEP_NODE(node, rbtdb, locked)) {
		if (isc_refcount_decrement(&node->references) == 1) {
			refs = isc_refcount_decrement(&nodelock->references);
			INSIST(refs > 0);
			return (true);
		} else {
			return (false);
		}
	}

	/* Upgrade the lock? */
	if (nlock == isc_rwlocktype_read) {
		NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
		NODE_LOCK(&nodelock->lock, isc_rwlocktype_write);
	}

	if (isc_refcount_decrement(&node->references) > 1) {
		/* Restore the lock? */
		if (nlock == isc_rwlocktype_read) {
			NODE_DOWNGRADE(&nodelock->lock);
		}
		return (false);
	}

	/* Last reference is gone: clean up stale/rolled-back headers. */
	if (node->dirty) {
		if (IS_CACHE(rbtdb)) {
			clean_cache_node(rbtdb, node);
		} else {
			if (least_serial == 0) {
				/*
				 * Caller doesn't know the least serial.
				 * Get it.
				 */
				RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
				least_serial = rbtdb->least_serial;
				RBTDB_UNLOCK(&rbtdb->lock,
					     isc_rwlocktype_read);
			}
			clean_zone_node(rbtdb, node, least_serial);
		}
	}

	/*
	 * Attempt to switch to a write lock on the tree.  If this fails,
	 * we will add this node to a linked list of nodes in this locking
	 * bucket which we will free later.
	 */
	if (tlock != isc_rwlocktype_write) {
		/*
		 * Locking hierarchy notwithstanding, we don't need to free
		 * the node lock before acquiring the tree write lock because
		 * we only do a trylock.
		 */
		if (tlock == isc_rwlocktype_read) {
			result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
		} else {
			result = isc_rwlock_trylock(&rbtdb->tree_lock,
						    isc_rwlocktype_write);
		}
		RUNTIME_CHECK(result == ISC_R_SUCCESS ||
			      result == ISC_R_LOCKBUSY);
		write_locked = (result == ISC_R_SUCCESS);
	} else {
		write_locked = true;
	}

	refs = isc_refcount_decrement(&nodelock->references);
	INSIST(refs > 0);

	if (KEEP_NODE(node, rbtdb, locked || write_locked)) {
		goto restore_locks;
	}

#undef KEEP_NODE

	if (write_locked) {
		/*
		 * We can now delete the node.
		 */

		/*
		 * If this node is the only one in the level it's in, deleting
		 * this node may recursively make its parent the only node in
		 * the parent level; if so, and if no one is currently using
		 * the parent node, this is almost the only opportunity to
		 * clean it up.  But the recursive cleanup is not that trivial
		 * since the child and parent may be in different lock buckets,
		 * which would cause a lock order reversal problem.  To avoid
		 * the trouble, we'll dispatch a separate event for batch
		 * cleaning.  We need to check whether we're deleting the node
		 * as a result of pruning to avoid infinite dispatching.
		 * Note: pruning happens only when a task has been set for the
		 * rbtdb.  If the user of the rbtdb chooses not to set a task,
		 * it's their responsibility to purge stale leaves (e.g. by
		 * periodic walk-through).
		 */
		if (!pruning && is_leaf(node) && rbtdb->task != NULL) {
			send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
			no_reference = false;
		} else {
			delete_node(rbtdb, node);
		}
	} else {
		INSIST(node->data == NULL);
		if (!ISC_LINK_LINKED(node, deadlink)) {
			ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
					deadlink);
		}
	}

restore_locks:
	/* Restore the lock? */
	if (nlock == isc_rwlocktype_read) {
		NODE_DOWNGRADE(&nodelock->lock);
	}

	/*
	 * Relock a read lock, or unlock the write lock if no lock was held.
	 */
	if (tlock == isc_rwlocktype_none) {
		if (write_locked) {
			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
		}
	}

	if (tlock == isc_rwlocktype_read) {
		if (write_locked) {
			isc_rwlock_downgrade(&rbtdb->tree_lock);
		}
	}

	return (no_reference);
}

/*
 * Prune the tree by recursively cleaning up single leaves.  Go through all
 * nodes stored in the rbtdb->prunenodes list; for each of them, in the worst
 * case, it will be necessary to traverse a number of tree levels equal to the
 * maximum legal number of domain name labels (127); in practice, the number of
 * tree levels to traverse will virtually always be much smaller (a few levels
 * at most).  While holding the tree lock throughout this entire operation is
 * less than ideal, so is splitting the latter up by queueing a separate
 * prune_tree() run for each node to start pruning from (as queueing requires
 * allocating memory and can therefore potentially be exploited to exhaust
 * available memory).  Also note that actually freeing up the memory used by
 * RBTDB nodes (which is what this function does) is essential to keeping cache
 * memory use in check, so since the tree lock needs to be acquired anyway,
 * freeing as many nodes as possible before the tree lock gets released is
 * prudent.
 */
static void
prune_tree(isc_task_t *task, isc_event_t *event) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)event->ev_arg;
	dns_rbtnode_t *node = NULL;
	dns_rbtnode_t *parent = NULL;
	unsigned int locknum;

	UNUSED(task);

	isc_event_free(&event);

	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
	while ((node = ISC_LIST_HEAD(rbtdb->prunenodes)) != NULL) {
		locknum = node->locknum;
		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
			  isc_rwlocktype_write);
		do {
			if (ISC_LINK_LINKED(node, prunelink)) {
				ISC_LIST_UNLINK(rbtdb->prunenodes, node,
						prunelink);
			}

			parent = node->parent;
			decrement_reference(rbtdb, node, 0,
					    isc_rwlocktype_write,
					    isc_rwlocktype_write, true);

			if (parent != NULL && parent->down == NULL) {
				/*
				 * node was the only down child of the parent
				 * and has just been removed.  We'll then need
				 * to examine the parent.  Keep the lock if
				 * possible; otherwise, release the old lock and
				 * acquire one for the parent.
				 */
				if (parent->locknum != locknum) {
					NODE_UNLOCK(
						&rbtdb->node_locks[locknum]
							 .lock,
						isc_rwlocktype_write);
					locknum = parent->locknum;
					NODE_LOCK(
						&rbtdb->node_locks[locknum]
							 .lock,
						isc_rwlocktype_write);
				}

				/*
				 * We need to gain a reference to the node
				 * before decrementing it in the next iteration.
				 */
				if (ISC_LINK_LINKED(parent, deadlink)) {
					ISC_LIST_UNLINK(
						rbtdb->deadnodes[locknum],
						parent, deadlink);
				}
				new_reference(rbtdb, parent,
					      isc_rwlocktype_write);
			} else {
				parent = NULL;
			}

			node = parent;
		} while (node != NULL);
		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
			    isc_rwlocktype_write);
	}
	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);

	/* Drop the DB reference taken when the prune event was queued. */
	detach((dns_db_t **)(void *)&rbtdb);
}

/*
 * Make 'version' the least open version, handing its changed list over
 * to 'cleanup_list'.
 */
static void
make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
		   rbtdb_changedlist_t *cleanup_list) {
	/*
	 * Caller must be holding the database lock.
	 */
	rbtdb->least_serial = version->serial;
	*cleanup_list = version->changed_list;
	ISC_LIST_INIT(version->changed_list);
}

static void
cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
	rbtdb_changed_t *changed, *next_changed;

	/*
	 * If the changed record is dirty, then
	 * an update created multiple versions of
	 * a given rdataset.  We keep this list
	 * until we're the least open version, at
	 * which point it's safe to get rid of any
	 * older versions.
	 *
	 * If the changed record isn't dirty, then
	 * we don't need it anymore since we're
	 * committing and not rolling back.
	 *
	 * The caller must be holding the database lock.
	 */
	for (changed = HEAD(version->changed_list); changed != NULL;
	     changed = next_changed)
	{
		next_changed = NEXT(changed, link);
		if (!changed->dirty) {
			UNLINK(version->changed_list, changed, link);
			APPEND(*cleanup_list, changed, link);
		}
	}
}

/*
 * Recompute the zone's security status for 'version': the zone is
 * secure when it has a zone key at the origin and a valid NSEC or
 * NSEC3 chain.
 */
static void
iszonesecure(dns_db_t *db, rbtdb_version_t *version, dns_dbnode_t *origin) {
	dns_rdataset_t keyset;
	dns_rdataset_t nsecset, signsecset;
	bool haszonekey = false;
	bool hasnsec = false;
	isc_result_t result;

	dns_rdataset_init(&keyset);
	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_dnskey,
				     0, 0, &keyset, NULL);
	if (result == ISC_R_SUCCESS) {
		result = dns_rdataset_first(&keyset);
		while (result == ISC_R_SUCCESS) {
			dns_rdata_t keyrdata = DNS_RDATA_INIT;
			dns_rdataset_current(&keyset, &keyrdata);
			if (dns_zonekey_iszonekey(&keyrdata)) {
				haszonekey = true;
				break;
			}
			result = dns_rdataset_next(&keyset);
		}
		dns_rdataset_disassociate(&keyset);
	}
	if (!haszonekey) {
		version->secure = dns_db_insecure;
		version->havensec3 = false;
		return;
	}

	dns_rdataset_init(&nsecset);
	dns_rdataset_init(&signsecset);
	result = dns_db_findrdataset(db, origin, version, dns_rdatatype_nsec, 0,
				     0, &nsecset, &signsecset);
	if (result == ISC_R_SUCCESS) {
		if (dns_rdataset_isassociated(&signsecset)) {
			hasnsec = true;
			dns_rdataset_disassociate(&signsecset);
		}
		dns_rdataset_disassociate(&nsecset);
	}

	setnsec3parameters(db,
			   version);

	/*
	 * Do we have a valid NSEC/NSEC3 chain?
	 */
	if (version->havensec3 || hasnsec) {
		version->secure = dns_db_secure;
	} else {
		version->secure = dns_db_insecure;
	}
}

/*%<
 * Walk the origin node looking for NSEC3PARAM records.
 * Cache the nsec3 parameters.
 */
static void
setnsec3parameters(dns_db_t *db, rbtdb_version_t *version) {
	dns_rbtnode_t *node;
	dns_rdata_nsec3param_t nsec3param;
	dns_rdata_t rdata = DNS_RDATA_INIT;
	isc_region_t region;
	isc_result_t result;
	rdatasetheader_t *header, *header_next;
	unsigned char *raw; /* RDATASLAB */
	unsigned int count, length;
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;

	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	version->havensec3 = false;
	node = rbtdb->origin_node;
	NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
		  isc_rwlocktype_read);
	for (header = node->data; header != NULL; header = header_next) {
		header_next = header->next;
		/*
		 * Walk down to the instance of this header that is
		 * visible in 'version' (serial <= version->serial and
		 * not flagged IGNORE); skip it if it's NONEXISTENT.
		 */
		do {
			if (header->serial <= version->serial &&
			    !IGNORE(header))
			{
				if (NONEXISTENT(header)) {
					header = NULL;
				}
				break;
			} else {
				header = header->down;
			}
		} while (header != NULL);

		if (header != NULL &&
		    (header->type == dns_rdatatype_nsec3param))
		{
			/*
			 * Find A NSEC3PARAM with a supported algorithm.
			 */
			raw = (unsigned char *)header + sizeof(*header);
			count = raw[0] * 256 + raw[1]; /* count */
			raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;
			while (count-- > 0U) {
				length = raw[0] * 256 + raw[1];
				raw += DNS_RDATASET_ORDER +
				       DNS_RDATASET_LENGTH;
				region.base = raw;
				region.length = length;
				raw += length;
				dns_rdata_fromregion(
					&rdata, rbtdb->common.rdclass,
					dns_rdatatype_nsec3param, &region);
				result = dns_rdata_tostruct(&rdata,
							    &nsec3param, NULL);
				INSIST(result == ISC_R_SUCCESS);
				dns_rdata_reset(&rdata);

				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG &&
				    !dns_nsec3_supportedhash(nsec3param.hash))
				{
					continue;
				}

				if (nsec3param.flags != 0) {
					continue;
				}

				memmove(version->salt, nsec3param.salt,
					nsec3param.salt_length);
				version->hash = nsec3param.hash;
				version->salt_length = nsec3param.salt_length;
				version->iterations = nsec3param.iterations;
				version->flags = nsec3param.flags;
				version->havensec3 = true;
				/*
				 * Look for a better algorithm than the
				 * unknown test algorithm.
				 */
				if (nsec3param.hash != DNS_NSEC3_UNKNOWNALG) {
					goto unlock;
				}
			}
		}
	}
unlock:
	NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
		    isc_rwlocktype_read);
	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
}

/*
 * Task event handler: purge up to a batch of dead nodes from every
 * bucket; resends itself while any dead nodes remain, then releases
 * the reference held for the event and possibly frees the database.
 */
static void
cleanup_dead_nodes_callback(isc_task_t *task, isc_event_t *event) {
	dns_rbtdb_t *rbtdb = event->ev_arg;
	bool again = false;
	unsigned int locknum;

	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
	for (locknum = 0; locknum < rbtdb->node_lock_count; locknum++) {
		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
			  isc_rwlocktype_write);
		cleanup_dead_nodes(rbtdb, locknum);
		if (ISC_LIST_HEAD(rbtdb->deadnodes[locknum]) != NULL) {
			again = true;
		}
		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
			    isc_rwlocktype_write);
	}
	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
	if (again) {
		isc_task_send(task, &event);
	} else {
		isc_event_free(&event);
		if (isc_refcount_decrement(&rbtdb->references) == 1) {
			(void)isc_refcount_current(&rbtdb->references);
			maybe_free_rbtdb(rbtdb);
		}
	}
}

static void
closeversion(dns_db_t *db,
	     dns_dbversion_t **versionp, bool commit) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *version, *cleanup_version, *least_greater;
	bool rollback = false;
	rbtdb_changedlist_t cleanup_list;
	rdatasetheaderlist_t resigned_list;
	rbtdb_changed_t *changed, *next_changed;
	rbtdb_serial_t serial, least_serial;
	dns_rbtnode_t *rbtnode;
	rdatasetheader_t *header;

	REQUIRE(VALID_RBTDB(rbtdb));
	version = (rbtdb_version_t *)*versionp;
	INSIST(version->rbtdb == rbtdb);

	cleanup_version = NULL;
	ISC_LIST_INIT(cleanup_list);
	ISC_LIST_INIT(resigned_list);

	if (isc_refcount_decrement(&version->references) > 1) {
		/* typical and easy case first */
		if (commit) {
			RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
			INSIST(!version->writer);
			RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
		}
		goto end;
	}

	/*
	 * Update the zone's secure status in version before making
	 * it the current version.
	 */
	if (version->writer && commit && !IS_CACHE(rbtdb)) {
		iszonesecure(db, version, rbtdb->origin_node);
	}

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
	serial = version->serial;
	if (version->writer) {
		if (commit) {
			unsigned cur_ref;
			rbtdb_version_t *cur_version;

			INSIST(version->commit_ok);
			INSIST(version == rbtdb->future_version);
			/*
			 * The current version is going to be replaced.
			 * Release the (likely last) reference to it from the
			 * DB itself and unlink it from the open list.
			 */
			cur_version = rbtdb->current_version;
			cur_ref = isc_refcount_decrement(
				&cur_version->references);
			if (cur_ref == 1) {
				(void)isc_refcount_current(
					&cur_version->references);
				if (cur_version->serial ==
				    rbtdb->least_serial) {
					INSIST(EMPTY(
						cur_version->changed_list));
				}
				UNLINK(rbtdb->open_versions, cur_version,
				       link);
			}
			if (EMPTY(rbtdb->open_versions)) {
				/*
				 * We're going to become the least open
				 * version.
				 */
				make_least_version(rbtdb, version,
						   &cleanup_list);
			} else {
				/*
				 * Some other open version is the
				 * least version.  We can't cleanup
				 * records that were changed in this
				 * version because the older versions
				 * may still be in use by an open
				 * version.
				 *
				 * We can, however, discard the
				 * changed records for things that
				 * we've added that didn't exist in
				 * prior versions.
				 */
				cleanup_nondirty(version, &cleanup_list);
			}
			/*
			 * If the (soon to be former) current version
			 * isn't being used by anyone, we can clean
			 * it up.
			 */
			if (cur_ref == 1) {
				cleanup_version = cur_version;
				APPENDLIST(version->changed_list,
					   cleanup_version->changed_list,
					   link);
			}
			/*
			 * Become the current version.
			 */
			version->writer = false;
			rbtdb->current_version = version;
			rbtdb->current_serial = version->serial;
			rbtdb->future_version = NULL;

			/*
			 * Keep the current version in the open list, and
			 * gain a reference for the DB itself (see the DB
			 * creation function below).  This must be the only
			 * case where we need to increment the counter from
			 * zero and need to use isc_refcount_increment0().
			 */
			INSIST(isc_refcount_increment0(
				       &version->references) == 0);
			PREPEND(rbtdb->open_versions, rbtdb->current_version,
				link);
			resigned_list = version->resigned_list;
			ISC_LIST_INIT(version->resigned_list);
		} else {
			/*
			 * We're rolling back this transaction.
			 */
			cleanup_list = version->changed_list;
			ISC_LIST_INIT(version->changed_list);
			resigned_list = version->resigned_list;
			ISC_LIST_INIT(version->resigned_list);
			rollback = true;
			cleanup_version = version;
			rbtdb->future_version = NULL;
		}
	} else {
		if (version != rbtdb->current_version) {
			/*
			 * There are no external or internal references
			 * to this version and it can be cleaned up.
			 */
			cleanup_version = version;

			/*
			 * Find the version with the least serial
			 * number greater than ours.
			 */
			least_greater = PREV(version, link);
			if (least_greater == NULL) {
				least_greater = rbtdb->current_version;
			}

			INSIST(version->serial < least_greater->serial);
			/*
			 * Is this the least open version?
			 */
			if (version->serial == rbtdb->least_serial) {
				/*
				 * Yes.  Install the new least open
				 * version.
				 */
				make_least_version(rbtdb, least_greater,
						   &cleanup_list);
			} else {
				/*
				 * Add any unexecuted cleanups to
				 * those of the least greater version.
				 */
				APPENDLIST(least_greater->changed_list,
					   version->changed_list, link);
			}
		} else if (version->serial == rbtdb->least_serial) {
			INSIST(EMPTY(version->changed_list));
		}
		UNLINK(rbtdb->open_versions, version, link);
	}
	least_serial = rbtdb->least_serial;
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	/* Free a version that no longer has any users. */
	if (cleanup_version != NULL) {
		INSIST(EMPTY(cleanup_version->changed_list));
		free_gluetable(cleanup_version);
		isc_rwlock_destroy(&cleanup_version->glue_rwlock);
		isc_rwlock_destroy(&cleanup_version->rwlock);
		isc_mem_put(rbtdb->common.mctx, cleanup_version,
			    sizeof(*cleanup_version));
	}

	/*
	 * Commit/rollback re-signed headers.
	 */
	for (header = HEAD(resigned_list); header != NULL;
	     header = HEAD(resigned_list))
	{
		nodelock_t *lock;

		ISC_LIST_UNLINK(resigned_list, header, link);

		lock = &rbtdb->node_locks[header->node->locknum].lock;
		NODE_LOCK(lock, isc_rwlocktype_write);
		if (rollback && !IGNORE(header)) {
			/* Put the header back on the resigning heap. */
			resign_insert(rbtdb, header->node->locknum, header);
		}
		decrement_reference(rbtdb, header->node, least_serial,
				    isc_rwlocktype_write, isc_rwlocktype_none,
				    false);
		NODE_UNLOCK(lock, isc_rwlocktype_write);
	}

	if (!EMPTY(cleanup_list)) {
		isc_event_t *event = NULL;
		isc_rwlocktype_t tlock = isc_rwlocktype_none;

		if (rbtdb->task != NULL) {
			event = isc_event_allocate(rbtdb->common.mctx, NULL,
						   DNS_EVENT_RBTDEADNODES,
						   cleanup_dead_nodes_callback,
						   rbtdb,
						   sizeof(isc_event_t));
		}
		if (event == NULL) {
			/*
			 * We acquire a tree write lock here in order to make
			 * sure that stale nodes will be removed in
			 * decrement_reference().  If we didn't have the lock,
			 * those nodes could miss the chance to be removed
			 * until the server stops.  The write lock is
			 * expensive, but this event should be rare enough
			 * to justify the cost.
			 */
			RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
			tlock = isc_rwlocktype_write;
		}

		for (changed = HEAD(cleanup_list); changed != NULL;
		     changed = next_changed)
		{
			nodelock_t *lock;

			next_changed = NEXT(changed, link);
			rbtnode = changed->node;
			lock = &rbtdb->node_locks[rbtnode->locknum].lock;
			NODE_LOCK(lock, isc_rwlocktype_write);
			/*
			 * This is a good opportunity to purge any dead nodes,
			 * so use it.
			 */
			if (event == NULL) {
				cleanup_dead_nodes(rbtdb, rbtnode->locknum);
			}

			if (rollback) {
				rollback_node(rbtnode, serial);
			}
			decrement_reference(rbtdb, rbtnode, least_serial,
					    isc_rwlocktype_write, tlock,
					    false);

			NODE_UNLOCK(lock, isc_rwlocktype_write);

			isc_mem_put(rbtdb->common.mctx, changed,
				    sizeof(*changed));
		}
		if (event != NULL) {
			/* Reference is dropped by the callback. */
			isc_refcount_increment(&rbtdb->references);
			isc_task_send(rbtdb->task, &event);
		} else {
			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
		}
	}

end:
	*versionp = NULL;
}

/*
 * Add the necessary magic for the wildcard name 'name'
 * to be found in 'rbtdb'.
 *
 * In order for wildcard matching to work correctly in
 * zone_find(), we must ensure that a node for the wildcarding
 * level exists in the database, and has its 'find_callback'
 * and 'wild' bits set.
 *
 * E.g. if the wildcard name is "*.sub.example." then we
 * must ensure that "sub.example." exists and is marked as
 * a wildcard level.
 *
 * tree_lock(write) must be held.
 */
static isc_result_t
add_wildcard_magic(dns_rbtdb_t *rbtdb, const dns_name_t *name, bool lock) {
	isc_result_t result;
	dns_name_t foundname;
	dns_offsets_t offsets;
	unsigned int n;
	dns_rbtnode_t *node = NULL;

	/* Strip the leading "*" label to get the wildcarding level. */
	dns_name_init(&foundname, offsets);
	n = dns_name_countlabels(name);
	INSIST(n >= 2);
	n--;
	dns_name_getlabelsequence(name, 1, n, &foundname);
	result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
	if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
		return (result);
	}
	if (result == ISC_R_SUCCESS) {
		node->nsec = DNS_RBT_NSEC_NORMAL;
	}
	node->find_callback = 1;
	/* 'lock' is false when the caller already holds the node lock. */
	if (lock) {
		NODE_LOCK(&rbtdb->node_locks[node->locknum].lock,
			  isc_rwlocktype_write);
	}
	node->wild = 1;
	if (lock) {
		NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
			    isc_rwlocktype_write);
	}
	return (ISC_R_SUCCESS);
}

/*
 * Add wildcard magic for every wildcard label level found between the
 * zone origin and 'name'.
 *
 * tree_lock(write) must be held.
 */
static isc_result_t
add_empty_wildcards(dns_rbtdb_t *rbtdb, const dns_name_t *name, bool lock) {
	isc_result_t result;
	dns_name_t foundname;
	dns_offsets_t offsets;
	unsigned int n, l, i;

	dns_name_init(&foundname, offsets);
	n = dns_name_countlabels(name);
	l = dns_name_countlabels(&rbtdb->common.origin);
	i = l + 1;
	/* Check each label count between the origin and the full name. */
	while (i < n) {
		dns_rbtnode_t *node = NULL; /* dummy */
		dns_name_getlabelsequence(name, n - i, i, &foundname);
		if (dns_name_iswildcard(&foundname)) {
			result = add_wildcard_magic(rbtdb, &foundname, lock);
			if (result != ISC_R_SUCCESS) {
				return (result);
			}
			result = dns_rbt_addnode(rbtdb->tree, &foundname,
						 &node);
			if (result != ISC_R_SUCCESS &&
			    result != ISC_R_EXISTS) {
				return (result);
			}
			if (result == ISC_R_SUCCESS) {
				node->nsec = DNS_RBT_NSEC_NORMAL;
			}
		}
		i++;
	}
	return (ISC_R_SUCCESS);
}

/*
 * Find (or, when 'create' is true, add) the node for 'name' in 'tree',
 * which must be either rbtdb->tree or rbtdb->nsec3.  On success the
 * node is reactivated (referenced) and returned via '*nodep'.
 */
static isc_result_t
findnodeintree(dns_rbtdb_t *rbtdb, dns_rbt_t *tree, const dns_name_t *name,
	       bool create, dns_dbnode_t **nodep) {
	dns_rbtnode_t *node = NULL;
	dns_name_t nodename;
	isc_result_t result;
	isc_rwlocktype_t locktype = isc_rwlocktype_read;

	INSIST(tree == rbtdb->tree || tree == rbtdb->nsec3);

	dns_name_init(&nodename, NULL);
	RWLOCK(&rbtdb->tree_lock,
	       locktype);
	result = dns_rbt_findnode(tree, name, NULL, &node, NULL,
				  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
	if (result != ISC_R_SUCCESS) {
		RWUNLOCK(&rbtdb->tree_lock, locktype);
		if (!create) {
			if (result == DNS_R_PARTIALMATCH) {
				result = ISC_R_NOTFOUND;
			}
			return (result);
		}
		/*
		 * It would be nice to try to upgrade the lock instead of
		 * unlocking then relocking.
		 */
		locktype = isc_rwlocktype_write;
		RWLOCK(&rbtdb->tree_lock, locktype);
		node = NULL;
		result = dns_rbt_addnode(tree, name, &node);
		if (result == ISC_R_SUCCESS) {
			dns_rbt_namefromnode(node, &nodename);
			node->locknum = node->hashval % rbtdb->node_lock_count;
			if (tree == rbtdb->tree) {
				/*
				 * NOTE(review): the result of
				 * add_empty_wildcards() is discarded here —
				 * presumably intentional best-effort; confirm
				 * against upstream.
				 */
				add_empty_wildcards(rbtdb, name, true);

				if (dns_name_iswildcard(name)) {
					result = add_wildcard_magic(rbtdb,
								    name,
								    true);
					if (result != ISC_R_SUCCESS) {
						RWUNLOCK(&rbtdb->tree_lock,
							 locktype);
						return (result);
					}
				}
			}
			if (tree == rbtdb->nsec3) {
				node->nsec = DNS_RBT_NSEC_NSEC3;
			}
		} else if (result != ISC_R_EXISTS) {
			RWUNLOCK(&rbtdb->tree_lock, locktype);
			return (result);
		}
	}

	if (tree == rbtdb->nsec3) {
		INSIST(node->nsec == DNS_RBT_NSEC_NSEC3);
	}

	/* Reference the node before the tree lock is released. */
	reactivate_node(rbtdb, node, locktype);

	RWUNLOCK(&rbtdb->tree_lock, locktype);

	*nodep = (dns_dbnode_t *)node;

	return (ISC_R_SUCCESS);
}

/*
 * dns_db method: look up (or create) 'name' in the main tree.
 */
static isc_result_t
findnode(dns_db_t *db, const dns_name_t *name, bool create,
	 dns_dbnode_t **nodep) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));

	return (findnodeintree(rbtdb, rbtdb->tree, name, create, nodep));
}

/*
 * dns_db method: look up (or create) 'name' in the NSEC3 tree.
 */
static isc_result_t
findnsec3node(dns_db_t *db, const dns_name_t *name, bool create,
	      dns_dbnode_t **nodep) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));

	return (findnodeintree(rbtdb, rbtdb->nsec3, name, create, nodep));
}

/*
 * dns_rbt_findnode() callback for zone searches: remember the topmost
 * active zone cut (NS or DNAME rdataset) passed while descending
 * toward the search name.
 */
static isc_result_t
zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
	rbtdb_search_t *search = arg;
	rdatasetheader_t *header, *header_next;
	rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
	rdatasetheader_t *found;
	isc_result_t result;
	dns_rbtnode_t
		*onode;

	/*
	 * We only want to remember the topmost zone cut, since it's the one
	 * that counts, so we'll just continue if we've already found a
	 * zonecut.
	 */
	if (search->zonecut != NULL) {
		return (DNS_R_CONTINUE);
	}

	found = NULL;
	result = DNS_R_CONTINUE;
	onode = search->rbtdb->origin_node;

	NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
		  isc_rwlocktype_read);

	/*
	 * Look for an NS or DNAME rdataset active in our version.
	 */
	ns_header = NULL;
	dname_header = NULL;
	sigdname_header = NULL;
	for (header = node->data; header != NULL; header = header_next) {
		header_next = header->next;
		if (header->type == dns_rdatatype_ns ||
		    header->type == dns_rdatatype_dname ||
		    header->type == RBTDB_RDATATYPE_SIGDNAME)
		{
			/*
			 * Walk the 'down' chain to the record visible
			 * at the search serial.
			 */
			do {
				if (header->serial <= search->serial &&
				    !IGNORE(header)) {
					/*
					 * Is this a "this rdataset doesn't
					 * exist" record?
					 */
					if (NONEXISTENT(header)) {
						header = NULL;
					}
					break;
				} else {
					header = header->down;
				}
			} while (header != NULL);
			if (header != NULL) {
				if (header->type == dns_rdatatype_dname) {
					dname_header = header;
				} else if (header->type ==
					   RBTDB_RDATATYPE_SIGDNAME) {
					sigdname_header = header;
				} else if (node != onode ||
					   IS_STUB(search->rbtdb)) {
					/*
					 * We've found an NS rdataset that
					 * isn't at the origin node.  We check
					 * that they're not at the origin node,
					 * because otherwise we'd erroneously
					 * treat the zone top as if it were
					 * a delegation.
					 */
					ns_header = header;
				}
			}
		}
	}

	/*
	 * Did we find anything?
	 */
	if (!IS_CACHE(search->rbtdb) && !IS_STUB(search->rbtdb) &&
	    ns_header != NULL)
	{
		/*
		 * Note that NS has precedence over DNAME if both exist
		 * in a zone.  Otherwise DNAME take precedence over NS.
		 */
		found = ns_header;
		search->zonecut_sigrdataset = NULL;
	} else if (dname_header != NULL) {
		found = dname_header;
		search->zonecut_sigrdataset = sigdname_header;
	} else if (ns_header != NULL) {
		found = ns_header;
		search->zonecut_sigrdataset = NULL;
	}

	if (found != NULL) {
		/*
		 * We increment the reference count on node to ensure that
		 * search->zonecut_rdataset will still be valid later.
		 */
		new_reference(search->rbtdb, node, isc_rwlocktype_read);
		search->zonecut = node;
		search->zonecut_rdataset = found;
		search->need_cleanup = true;
		/*
		 * Since we've found a zonecut, anything beneath it is
		 * glue and is not subject to wildcard matching, so we
		 * may clear search->wild.
		 */
		search->wild = false;
		if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
			/*
			 * If the caller does not want to find glue, then
			 * this is the best answer and the search should
			 * stop now.
			 */
			result = DNS_R_PARTIALMATCH;
		} else {
			dns_name_t *zcname;

			/*
			 * The search will continue beneath the zone cut.
			 * This may or may not be the best match.  In case it
			 * is, we need to remember the node name.
			 */
			zcname = dns_fixedname_name(&search->zonecut_name);
			dns_name_copy(name, zcname);
			search->copy_name = true;
		}
	} else {
		/*
		 * There is no zonecut at this node which is active in this
		 * version.
		 *
		 * If this is a "wild" node and the caller hasn't disabled
		 * wildcard matching, remember that we've seen a wild node
		 * in case we need to go searching for wildcard matches
		 * later on.
		 */
		if (node->wild &&
		    (search->options & DNS_DBFIND_NOWILD) == 0) {
			search->wild = true;
		}
	}

	NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
		    isc_rwlocktype_read);

	return (result);
}

/*
 * Bind 'header' to 'rdataset' for the caller: take a node reference,
 * fill in the rdataset fields from the header, and mark stale/ancient
 * state for records that are no longer active at time 'now'.
 */
static void
bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
	      rdatasetheader_t *header, isc_stdtime_t now,
	      isc_rwlocktype_t locktype, dns_rdataset_t *rdataset) {
	unsigned char *raw; /* RDATASLAB */
	bool stale = STALE(header);
	bool ancient = ANCIENT(header);

	/*
	 * Caller must be holding the node reader lock.
	 * XXXJT: technically, we need a writer lock, since we'll increment
	 * the header count below.  However, since the actual counter value
	 * doesn't matter, we prioritize performance here.  (We may want to
	 * use atomic increment when available).
	 */

	if (rdataset == NULL) {
		return;
	}

	new_reference(rbtdb, node, locktype);

	INSIST(rdataset->methods == NULL); /* We must be disassociated. */

	/*
	 * Mark header stale or ancient if the RRset is no longer active.
	 */
	if (!ACTIVE(header, now)) {
		dns_ttl_t stale_ttl = header->rdh_ttl +
				      STALE_TTL(header, rbtdb);
		/*
		 * If this data is in the stale window keep it and if
		 * DNS_DBFIND_STALEOK is not set we tell the caller to
		 * skip this record.  We skip the records with ZEROTTL
		 * (these records should not be cached anyway).
		 */
		if (KEEPSTALE(rbtdb) && stale_ttl > now) {
			stale = true;
		} else {
			/*
			 * We are not keeping stale, or it is outside the
			 * stale window.  Mark ancient, i.e. ready for
			 * cleanup.
			 */
			ancient = true;
		}
	}

	rdataset->methods = &rdataset_methods;
	rdataset->rdclass = rbtdb->common.rdclass;
	rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
	rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
	rdataset->ttl = header->rdh_ttl - now;
	rdataset->trust = header->trust;

	/* Propagate header flags into rdataset attributes. */
	if (NEGATIVE(header)) {
		rdataset->attributes |= DNS_RDATASETATTR_NEGATIVE;
	}
	if (NXDOMAIN(header)) {
		rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
	}
	if (OPTOUT(header)) {
		rdataset->attributes |= DNS_RDATASETATTR_OPTOUT;
	}
	if (PREFETCH(header)) {
		rdataset->attributes |= DNS_RDATASETATTR_PREFETCH;
	}

	if (stale && !ancient) {
		dns_ttl_t stale_ttl = header->rdh_ttl +
				      STALE_TTL(header, rbtdb);
		if (stale_ttl > now) {
			rdataset->ttl = stale_ttl - now;
		} else {
			rdataset->ttl = 0;
		}
		if (STALE_WINDOW(header)) {
			rdataset->attributes |= DNS_RDATASETATTR_STALE_WINDOW;
		}
		rdataset->attributes |= DNS_RDATASETATTR_STALE;
	} else if (IS_CACHE(rbtdb) && !ACTIVE(header, now)) {
		rdataset->attributes |= DNS_RDATASETATTR_ANCIENT;
		rdataset->ttl = header->rdh_ttl;
	}

	rdataset->private1 = rbtdb;
	rdataset->private2 = node;
	raw = (unsigned char *)header + sizeof(*header);
	rdataset->private3 = raw;
	/* Rotate the RRset start point for round-robin ordering. */
	rdataset->count = atomic_fetch_add_relaxed(&header->count, 1);
	if (rdataset->count == UINT32_MAX) {
		rdataset->count = 0;
	}

	/*
	 * Reset iterator state.
	 */
	rdataset->privateuint4 = 0;
	rdataset->private5 = NULL;

	/*
	 * Add noqname proof.
	 */
	rdataset->private6 = header->noqname;
	if (rdataset->private6 != NULL) {
		rdataset->attributes |= DNS_RDATASETATTR_NOQNAME;
	}
	rdataset->private7 = header->closest;
	if (rdataset->private7 != NULL) {
		rdataset->attributes |= DNS_RDATASETATTR_CLOSEST;
	}

	/*
	 * Copy out re-signing information.
	 */
	if (RESIGN(header)) {
		rdataset->attributes |= DNS_RDATASETATTR_RESIGN;
		rdataset->resign = (header->resign << 1) | header->resign_lsb;
	} else {
		rdataset->resign = 0;
	}
}

/*
 * Return the delegation (or DNAME) recorded in 'search' to the caller:
 * copy the zone cut name into 'foundname', optionally hand out the node
 * and bind the zonecut rdatasets.  Returns DNS_R_DNAME or
 * DNS_R_DELEGATION depending on the zonecut rdataset type.
 */
static isc_result_t
setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
		 dns_name_t *foundname, dns_rdataset_t *rdataset,
		 dns_rdataset_t *sigrdataset) {
	dns_name_t *zcname;
	rbtdb_rdatatype_t type;
	dns_rbtnode_t *node;

	REQUIRE(search != NULL);
	REQUIRE(search->zonecut != NULL);
	REQUIRE(search->zonecut_rdataset != NULL);

	/*
	 * The caller MUST NOT be holding any node locks.
	 */

	node = search->zonecut;
	type = search->zonecut_rdataset->type;

	/*
	 * If we have to set foundname, we do it before anything else.
	 * If we were to set foundname after we had set nodep or bound the
	 * rdataset, then we'd have to undo that work if dns_name_copy()
	 * failed.  By setting foundname first, there's nothing to undo if
	 * we have trouble.
	 */
	if (foundname != NULL && search->copy_name) {
		zcname = dns_fixedname_name(&search->zonecut_name);
		dns_name_copy(zcname, foundname);
	}
	if (nodep != NULL) {
		/*
		 * Note that we don't have to increment the node's reference
		 * count here because we're going to use the reference we
		 * already have in the search block.
		 */
		*nodep = node;
		search->need_cleanup = false;
	}
	if (rdataset != NULL) {
		NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
			  isc_rwlocktype_read);
		bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
			      search->now, isc_rwlocktype_read, rdataset);
		if (sigrdataset != NULL &&
		    search->zonecut_sigrdataset != NULL) {
			bind_rdataset(search->rbtdb, node,
				      search->zonecut_sigrdataset,
				      search->now, isc_rwlocktype_read,
				      sigrdataset);
		}
		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
			    isc_rwlocktype_read);
	}

	if (type == dns_rdatatype_dname) {
		return (DNS_R_DNAME);
	}
	return (DNS_R_DELEGATION);
}

/*
 * Return true if an rdataset of 'type' at 'node' named 'name' is valid
 * glue for the zonecut NS RRset recorded in 'search', i.e. 'name'
 * appears as an NS target in that RRset.
 */
static bool
valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
	   dns_rbtnode_t *node) {
	unsigned char *raw; /* RDATASLAB */
	unsigned int count, size;
	dns_name_t ns_name;
	bool valid = false;
	dns_offsets_t offsets;
	isc_region_t region;
	rdatasetheader_t *header;

	/*
	 * No additional locking is required.
	 */

	/*
	 * Valid glue types are A, AAAA, A6.  NS is also a valid glue type
	 * if it occurs at a zone cut, but is not valid below it.
	 */
	if (type == dns_rdatatype_ns) {
		if (node != search->zonecut) {
			return (false);
		}
	} else if (type != dns_rdatatype_a && type != dns_rdatatype_aaaa &&
		   type != dns_rdatatype_a6)
	{
		return (false);
	}

	/* Walk each NS rdata in the zonecut slab, comparing targets. */
	header = search->zonecut_rdataset;
	raw = (unsigned char *)header + sizeof(*header);
	count = raw[0] * 256 + raw[1];
	raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;

	while (count > 0) {
		count--;
		size = raw[0] * 256 + raw[1];
		raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
		region.base = raw;
		region.length = size;
		raw += size;
		/*
		 * XXX Until we have rdata structures, we have no choice but
		 * to directly access the rdata format.
		 */
		dns_name_init(&ns_name, offsets);
		dns_name_fromregion(&ns_name, &region);
		if (dns_name_compare(&ns_name, name) == 0) {
			valid = true;
			break;
		}
	}

	return (valid);
}

/*
 * Return true if the first node after 'chain' carrying an active
 * rdataset lies at or below 'name', i.e. 'name' is an active empty
 * non-terminal node.
 */
static bool
activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
	    const dns_name_t *name) {
	dns_fixedname_t fnext;
	dns_fixedname_t forigin;
	dns_name_t *next;
	dns_name_t *origin;
	dns_name_t prefix;
	dns_rbtdb_t *rbtdb;
	dns_rbtnode_t *node;
	isc_result_t result;
	bool answer = false;
	rdatasetheader_t *header;

	rbtdb = search->rbtdb;

	dns_name_init(&prefix, NULL);
	next = dns_fixedname_initname(&fnext);
	origin = dns_fixedname_initname(&forigin);

	/* Advance until a node with an active, extant rdataset is seen. */
	result = dns_rbtnodechain_next(chain, NULL, NULL);
	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
		node = NULL;
		result = dns_rbtnodechain_current(chain, &prefix, origin,
						  &node);
		if (result != ISC_R_SUCCESS) {
			break;
		}
		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
			  isc_rwlocktype_read);
		for (header = node->data; header != NULL;
		     header = header->next) {
			if (header->serial <= search->serial &&
			    !IGNORE(header) && EXISTS(header)) {
				break;
			}
		}
		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
			    isc_rwlocktype_read);
		if (header != NULL) {
			break;
		}
		result = dns_rbtnodechain_next(chain, NULL, NULL);
	}
	if (result == ISC_R_SUCCESS) {
		result = dns_name_concatenate(&prefix, origin, next, NULL);
	}
	if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name)) {
		answer = true;
	}
	return (answer);
}

/*
 * Return true if 'qname' sits at or below an active empty node between
 * the names adjacent to the wildcard 'wname', which would make the
 * wildcard inapplicable.
 */
static bool
activeemptynode(rbtdb_search_t *search, const dns_name_t *qname,
		dns_name_t *wname) {
	dns_fixedname_t fnext;
	dns_fixedname_t forigin;
	dns_fixedname_t fprev;
	dns_name_t *next;
	dns_name_t *origin;
	dns_name_t *prev;
	dns_name_t name;
	dns_name_t rname;
	dns_name_t tname;
	dns_rbtdb_t *rbtdb;
	dns_rbtnode_t *node;
	dns_rbtnodechain_t chain;
	bool check_next = true;
	bool check_prev = true;
	bool answer = false;
	isc_result_t result;
	rdatasetheader_t *header;
	unsigned int n;

	rbtdb = search->rbtdb;

	dns_name_init(&name, NULL);
	dns_name_init(&tname, NULL);
	dns_name_init(&rname, NULL);
	next = dns_fixedname_initname(&fnext);
	prev = dns_fixedname_initname(&fprev);
	origin = dns_fixedname_initname(&forigin);

	/*
	 * Find if qname is at or below an empty node.
	 * Use our own copy of the chain.
	 */
	chain = search->chain;
	do {
		node = NULL;
		result = dns_rbtnodechain_current(&chain, &name, origin,
						  &node);
		if (result != ISC_R_SUCCESS) {
			break;
		}
		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
			  isc_rwlocktype_read);
		for (header = node->data; header != NULL;
		     header = header->next) {
			if (header->serial <= search->serial &&
			    !IGNORE(header) && EXISTS(header)) {
				break;
			}
		}
		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
			    isc_rwlocktype_read);
		if (header != NULL) {
			break;
		}
		result = dns_rbtnodechain_prev(&chain, NULL, NULL);
	} while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
	if (result == ISC_R_SUCCESS) {
		result = dns_name_concatenate(&name, origin, prev, NULL);
	}
	if (result != ISC_R_SUCCESS) {
		check_prev = false;
	}

	/* Likewise, find the next active name after the chain point. */
	result = dns_rbtnodechain_next(&chain, NULL, NULL);
	while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
		node = NULL;
		result = dns_rbtnodechain_current(&chain, &name, origin,
						  &node);
		if (result != ISC_R_SUCCESS) {
			break;
		}
		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
			  isc_rwlocktype_read);
		for (header = node->data; header != NULL;
		     header = header->next) {
			if (header->serial <= search->serial &&
			    !IGNORE(header) && EXISTS(header)) {
				break;
			}
		}
		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
			    isc_rwlocktype_read);
		if (header != NULL) {
			break;
		}
		result = dns_rbtnodechain_next(&chain, NULL, NULL);
	}
	if (result == ISC_R_SUCCESS) {
		result = dns_name_concatenate(&name, origin, next, NULL);
	}
	if (result != ISC_R_SUCCESS) {
		check_next = false;
	}

	dns_name_clone(qname, &rname);

	/*
	 * Remove the wildcard label to find the terminal name.
	 */
	n = dns_name_countlabels(wname);
	dns_name_getlabelsequence(wname, 1, n - 1, &tname);

	do {
		if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
		    (check_next && dns_name_issubdomain(next, &rname)))
		{
			answer = true;
			break;
		}
		/*
		 * Remove the left hand label.
		 */
		n = dns_name_countlabels(&rname);
		dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
	} while (!dns_name_equal(&rname, &tname));
	return (answer);
}

/*
 * Look for an active wildcard at each ancestor level of '*nodep'.
 * On success (ISC_R_SUCCESS) '*nodep' is replaced with the wildcard
 * node; ISC_R_NOTFOUND means no applicable wildcard exists.
 */
static isc_result_t
find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
	      const dns_name_t *qname) {
	unsigned int i, j;
	dns_rbtnode_t *node, *level_node, *wnode;
	rdatasetheader_t *header;
	isc_result_t result = ISC_R_NOTFOUND;
	dns_name_t name;
	dns_name_t *wname;
	dns_fixedname_t fwname;
	dns_rbtdb_t *rbtdb;
	bool done, wild, active;
	dns_rbtnodechain_t wchain;

	/*
	 * Caller must be holding the tree lock and MUST NOT be holding
	 * any node locks.
	 */

	/*
	 * Examine each ancestor level.  If the level's wild bit
	 * is set, then construct the corresponding wildcard name and
	 * search for it.  If the wildcard node exists, and is active in
	 * this version, we're done.  If not, then we next check to see
	 * if the ancestor is active in this version.  If so, then there
	 * can be no possible wildcard match and again we're done.  If not,
	 * continue the search.
	 */

	rbtdb = search->rbtdb;
	i = search->chain.level_matches;
	done = false;
	node = *nodep;
	do {
		NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
			  isc_rwlocktype_read);

		/*
		 * First we try to figure out if this node is active in
		 * the search's version.  We do this now, even though we
		 * may not need the information, because it simplifies the
		 * locking and code flow.
		 */
		for (header = node->data; header != NULL;
		     header = header->next) {
			if (header->serial <= search->serial &&
			    !IGNORE(header) && EXISTS(header) &&
			    !ANCIENT(header)) {
				break;
			}
		}
		if (header != NULL) {
			active = true;
		} else {
			active = false;
		}

		if (node->wild) {
			wild = true;
		} else {
			wild = false;
		}

		NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
			    isc_rwlocktype_read);

		if (wild) {
			/*
			 * Construct the wildcard name for this level.
			 */
			dns_name_init(&name, NULL);
			dns_rbt_namefromnode(node, &name);
			wname = dns_fixedname_initname(&fwname);
			result = dns_name_concatenate(dns_wildcardname, &name,
						      wname, NULL);
			/* Append each ancestor level's name in turn. */
			j = i;
			while (result == ISC_R_SUCCESS && j != 0) {
				j--;
				level_node = search->chain.levels[j];
				dns_name_init(&name, NULL);
				dns_rbt_namefromnode(level_node, &name);
				result = dns_name_concatenate(wname, &name,
							      wname, NULL);
			}
			if (result != ISC_R_SUCCESS) {
				break;
			}

			wnode = NULL;
			dns_rbtnodechain_init(&wchain);
			result = dns_rbt_findnode(
				rbtdb->tree, wname, NULL, &wnode, &wchain,
				DNS_RBTFIND_EMPTYDATA, NULL, NULL);
			if (result == ISC_R_SUCCESS) {
				nodelock_t *lock;

				/*
				 * We have found the wildcard node.  If it
				 * is active in the search's version, we're
				 * done.
				 */
				lock = &rbtdb->node_locks[wnode->locknum]
						.lock;
				NODE_LOCK(lock, isc_rwlocktype_read);
				for (header = wnode->data; header != NULL;
				     header = header->next) {
					if (header->serial <=
						    search->serial &&
					    !IGNORE(header) &&
					    EXISTS(header) &&
					    !ANCIENT(header)) {
						break;
					}
				}
				NODE_UNLOCK(lock, isc_rwlocktype_read);
				if (header != NULL ||
				    activeempty(search, &wchain, wname)) {
					if (activeemptynode(search, qname,
							    wname)) {
						return (ISC_R_NOTFOUND);
					}
					/*
					 * The wildcard node is active!
					 *
					 * Note: result is still
					 * ISC_R_SUCCESS so we don't have
					 * to set it.
					 */
					*nodep = wnode;
					break;
				}
			} else if (result != ISC_R_NOTFOUND &&
				   result != DNS_R_PARTIALMATCH) {
				/*
				 * An error has occurred.  Bail out.
				 */
				break;
			}
		}

		if (active) {
			/*
			 * The level node is active.  Any wildcarding
			 * present at higher levels has no
			 * effect and we're done.
			 */
			result = ISC_R_NOTFOUND;
			break;
		}

		/* Move up one level in the search chain. */
		if (i > 0) {
			i--;
			node = search->chain.levels[i];
		} else {
			done = true;
		}
	} while (!done);

	return (result);
}

/*
 * Return true if the NSEC3 rdataslab in 'header' contains a record
 * generated with the NSEC3PARAM (hash, iterations, salt) recorded in
 * the search's version.
 */
static bool
matchparams(rdatasetheader_t *header, rbtdb_search_t *search) {
	dns_rdata_t rdata = DNS_RDATA_INIT;
	dns_rdata_nsec3_t nsec3;
	unsigned char *raw; /* RDATASLAB */
	unsigned int rdlen, count;
	isc_region_t region;
	isc_result_t result;

	REQUIRE(header->type == dns_rdatatype_nsec3);

	raw = (unsigned char *)header + sizeof(*header);
	count = raw[0] * 256 + raw[1]; /* count */
	raw += DNS_RDATASET_COUNT + DNS_RDATASET_LENGTH;

	while (count-- > 0) {
		rdlen = raw[0] * 256 + raw[1];
		raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH;
		region.base = raw;
		region.length = rdlen;
		dns_rdata_fromregion(&rdata, search->rbtdb->common.rdclass,
				     dns_rdatatype_nsec3, &region);
		raw += rdlen;
		result = dns_rdata_tostruct(&rdata, &nsec3, NULL);
		INSIST(result == ISC_R_SUCCESS);
		if (nsec3.hash == search->rbtversion->hash &&
		    nsec3.iterations == search->rbtversion->iterations &&
		    nsec3.salt_length == search->rbtversion->salt_length &&
		    memcmp(nsec3.salt, search->rbtversion->salt,
			   nsec3.salt_length) == 0)
		{
			return (true);
		}
		dns_rdata_reset(&rdata);
	}
	return (false);
}

/*
 * Find node of the NSEC/NSEC3 record that is 'name'.
 */
static isc_result_t
previous_closest_nsec(dns_rdatatype_t type, rbtdb_search_t *search,
		      dns_name_t *name, dns_name_t *origin,
		      dns_rbtnode_t **nodep, dns_rbtnodechain_t *nsecchain,
		      bool *firstp) {
	dns_fixedname_t ftarget;
	dns_name_t *target;
	dns_rbtnode_t *nsecnode;
	isc_result_t result;

	REQUIRE(nodep != NULL && *nodep == NULL);
	REQUIRE(type == dns_rdatatype_nsec3 || firstp != NULL);

	if (type == dns_rdatatype_nsec3) {
		/*
		 * NSEC3 nodes live in the search chain's own tree; just
		 * step back one node.
		 */
		result = dns_rbtnodechain_prev(&search->chain, NULL, NULL);
		if (result != ISC_R_SUCCESS && result != DNS_R_NEWORIGIN) {
			return (result);
		}
		result = dns_rbtnodechain_current(&search->chain, name,
						  origin, nodep);
		return (result);
	}

	target = dns_fixedname_initname(&ftarget);

	for (;;) {
		if (*firstp) {
			/*
			 * Construct the name of the second node to check.
			 * It is the first node sought in the NSEC tree.
			 */
			*firstp = false;
			dns_rbtnodechain_init(nsecchain);
			result = dns_name_concatenate(name, origin, target,
						      NULL);
			if (result != ISC_R_SUCCESS) {
				return (result);
			}
			nsecnode = NULL;
			result = dns_rbt_findnode(
				search->rbtdb->nsec, target, NULL, &nsecnode,
				nsecchain, DNS_RBTFIND_EMPTYDATA, NULL,
				NULL);
			if (result == ISC_R_SUCCESS) {
				/*
				 * Since this was the first loop, finding the
				 * name in the NSEC tree implies that the
				 * first node checked in the main tree had an
				 * unacceptable NSEC record.
				 * Try the previous node in the NSEC tree.
				 */
				result = dns_rbtnodechain_prev(nsecchain,
							       name, origin);
				if (result == DNS_R_NEWORIGIN) {
					result = ISC_R_SUCCESS;
				}
			} else if (result == ISC_R_NOTFOUND ||
				   result == DNS_R_PARTIALMATCH) {
				result = dns_rbtnodechain_current(
					nsecchain, name, origin, NULL);
				if (result == ISC_R_NOTFOUND) {
					result = ISC_R_NOMORE;
				}
			}
		} else {
			/*
			 * This is a second or later trip through the
			 * auxiliary tree for the name of a third or earlier
			 * NSEC node in the main tree.  Previous trips
			 * through the NSEC tree must have found nodes in the
			 * main tree with NSEC records.  Perhaps they lacked
			 * signature records.
			 */
			result = dns_rbtnodechain_prev(nsecchain, name,
						       origin);
			if (result == DNS_R_NEWORIGIN) {
				result = ISC_R_SUCCESS;
			}
		}
		if (result != ISC_R_SUCCESS) {
			return (result);
		}

		/*
		 * Construct the name to seek in the main tree.
		 */
		result = dns_name_concatenate(name, origin, target, NULL);
		if (result != ISC_R_SUCCESS) {
			return (result);
		}

		*nodep = NULL;
		result = dns_rbt_findnode(search->rbtdb->tree, target, NULL,
					  nodep, &search->chain,
					  DNS_RBTFIND_EMPTYDATA, NULL, NULL);
		if (result == ISC_R_SUCCESS) {
			return (result);
		}

		/*
		 * There should always be a node in the main tree with the
		 * same name as the node in the auxiliary NSEC tree, except
		 * for nodes in the auxiliary tree that are awaiting
		 * deletion.
		 */
		if (result != DNS_R_PARTIALMATCH &&
		    result != ISC_R_NOTFOUND) {
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_ERROR,
				      "previous_closest_nsec(): %s",
				      isc_result_totext(result));
			return (DNS_R_BADDB);
		}
	}
}

/*
 * Find the NSEC/NSEC3 which is or before the current point on the
 * search chain.  For NSEC3 records only NSEC3 records that match the
 * current NSEC3PARAM record are considered.
*/ static isc_result_t find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, dns_name_t *foundname, dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset, dns_rbt_t *tree, dns_db_secure_t secure) { dns_rbtnode_t *node, *prevnode; rdatasetheader_t *header, *header_next, *found, *foundsig; dns_rbtnodechain_t nsecchain; bool empty_node; isc_result_t result; dns_fixedname_t fname, forigin; dns_name_t *name, *origin; dns_rdatatype_t type; rbtdb_rdatatype_t sigtype; bool wraps; bool first = true; bool need_sig = (secure == dns_db_secure); if (tree == search->rbtdb->nsec3) { type = dns_rdatatype_nsec3; sigtype = RBTDB_RDATATYPE_SIGNSEC3; wraps = true; } else { type = dns_rdatatype_nsec; sigtype = RBTDB_RDATATYPE_SIGNSEC; wraps = false; } /* * Use the auxiliary tree only starting with the second node in the * hope that the original node will be right much of the time. */ name = dns_fixedname_initname(&fname); origin = dns_fixedname_initname(&forigin); again: node = NULL; prevnode = NULL; result = dns_rbtnodechain_current(&search->chain, name, origin, &node); if (result != ISC_R_SUCCESS) { return (result); } do { NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock), isc_rwlocktype_read); found = NULL; foundsig = NULL; empty_node = true; for (header = node->data; header != NULL; header = header_next) { header_next = header->next; /* * Look for an active, extant NSEC or RRSIG NSEC. */ do { if (header->serial <= search->serial && !IGNORE(header)) { /* * Is this a "this rdataset doesn't * exist" record? */ if (NONEXISTENT(header)) { header = NULL; } break; } else { header = header->down; } } while (header != NULL); if (header != NULL) { /* * We now know that there is at least one * active rdataset at this node. 
*/ empty_node = false; if (header->type == type) { found = header; if (foundsig != NULL) { break; } } else if (header->type == sigtype) { foundsig = header; if (found != NULL) { break; } } } } if (!empty_node) { if (found != NULL && search->rbtversion->havensec3 && found->type == dns_rdatatype_nsec3 && !matchparams(found, search)) { empty_node = true; found = NULL; foundsig = NULL; result = previous_closest_nsec( type, search, name, origin, &prevnode, NULL, NULL); } else if (found != NULL && (foundsig != NULL || !need_sig)) { /* * We've found the right NSEC/NSEC3 record. * * Note: for this to really be the right * NSEC record, it's essential that the NSEC * records of any nodes obscured by a zone * cut have been removed; we assume this is * the case. */ result = dns_name_concatenate(name, origin, foundname, NULL); if (result == ISC_R_SUCCESS) { if (nodep != NULL) { new_reference( search->rbtdb, node, isc_rwlocktype_read); *nodep = node; } bind_rdataset(search->rbtdb, node, found, search->now, isc_rwlocktype_read, rdataset); if (foundsig != NULL) { bind_rdataset( search->rbtdb, node, foundsig, search->now, isc_rwlocktype_read, sigrdataset); } } } else if (found == NULL && foundsig == NULL) { /* * This node is active, but has no NSEC or * RRSIG NSEC. That means it's glue or * other obscured zone data that isn't * relevant for our search. Treat the * node as if it were empty and keep looking. */ empty_node = true; result = previous_closest_nsec( type, search, name, origin, &prevnode, &nsecchain, &first); } else { /* * We found an active node, but either the * NSEC or the RRSIG NSEC is missing. This * shouldn't happen. */ result = DNS_R_BADDB; } } else { /* * This node isn't active. We've got to keep * looking. 
			 */
			result = previous_closest_nsec(type, search, name,
						       origin, &prevnode,
						       &nsecchain, &first);
		}
		NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
			    isc_rwlocktype_read);
		node = prevnode;
		prevnode = NULL;
	} while (empty_node && result == ISC_R_SUCCESS);

	if (!first) {
		dns_rbtnodechain_invalidate(&nsecchain);
	}

	if (result == ISC_R_NOMORE && wraps) {
		/* NSEC3 chains are circular: restart from the last node. */
		result = dns_rbtnodechain_last(&search->chain, tree, NULL,
					       NULL);
		if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
			wraps = false;
			goto again;
		}
	}

	/*
	 * If the result is ISC_R_NOMORE, then we got to the beginning of
	 * the database and didn't find a NSEC record.  This shouldn't
	 * happen.
	 */
	if (result == ISC_R_NOMORE) {
		result = DNS_R_BADDB;
	}

	return (result);
}

/*
 * dns_db_find() implementation for zone databases: look up 'name'/'type'
 * in 'version' (the current version when NULL), handling zone cuts,
 * wildcards, CNAME substitution, glue, and NSEC/NSEC3 proofs of
 * nonexistence.
 *
 * On success the node (when 'nodep' is non-NULL) and the found rdataset
 * and its signature are bound into the caller's arguments.  Result codes
 * seen below include ISC_R_SUCCESS, DNS_R_CNAME, DNS_R_GLUE,
 * DNS_R_ZONECUT, DNS_R_NXRRSET, DNS_R_NXDOMAIN, DNS_R_EMPTYNAME,
 * DNS_R_EMPTYWILD, DNS_R_BADDB, and whatever setup_delegation() yields
 * for referrals.
 */
static isc_result_t
zone_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
	  dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
	  dns_dbnode_t **nodep, dns_name_t *foundname,
	  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
	dns_rbtnode_t *node = NULL;
	isc_result_t result;
	rbtdb_search_t search;
	bool cname_ok = true;
	bool close_version = false;
	bool maybe_zonecut = false;
	bool at_zonecut = false;
	bool wild;
	bool empty_node;
	rdatasetheader_t *header, *header_next, *found, *nsecheader;
	rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
	rbtdb_rdatatype_t sigtype;
	bool active;
	nodelock_t *lock;
	dns_rbt_t *tree;

	search.rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(search.rbtdb));
	INSIST(version == NULL ||
	       ((rbtdb_version_t *)version)->rbtdb == (dns_rbtdb_t *)db);

	/*
	 * We don't care about 'now'.
	 */
	UNUSED(now);

	/*
	 * If the caller didn't supply a version, attach to the current
	 * version.
	 */
	if (version == NULL) {
		currentversion(db, &version);
		close_version = true;
	}

	search.rbtversion = version;
	search.serial = search.rbtversion->serial;
	search.options = options;
	search.copy_name = false;
	search.need_cleanup = false;
	search.wild = false;
	search.zonecut = NULL;
	dns_fixedname_init(&search.zonecut_name);
	dns_rbtnodechain_init(&search.chain);
	search.now = 0;

	/*
	 * 'wild' will be true iff. we've matched a wildcard.
	 */
	wild = false;

	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * Search down from the root of the tree.  If, while going down, we
	 * encounter a callback node, zone_zonecut_callback() will search the
	 * rdatasets at the zone cut for active DNAME or NS rdatasets.
	 */
	tree = (options & DNS_DBFIND_FORCENSEC3) != 0 ? search.rbtdb->nsec3
						      : search.rbtdb->tree;
	result = dns_rbt_findnode(tree, name, foundname, &node, &search.chain,
				  DNS_RBTFIND_EMPTYDATA, zone_zonecut_callback,
				  &search);

	if (result == DNS_R_PARTIALMATCH) {
	partial_match:
		if (search.zonecut != NULL) {
			result = setup_delegation(&search, nodep, foundname,
						  rdataset, sigrdataset);
			goto tree_exit;
		}

		if (search.wild) {
			/*
			 * At least one of the levels in the search chain
			 * potentially has a wildcard.  For each such level,
			 * we must see if there's a matching wildcard active
			 * in the current version.
			 */
			result = find_wildcard(&search, &node, name);
			if (result == ISC_R_SUCCESS) {
				dns_name_copy(name, foundname);
				wild = true;
				goto found;
			} else if (result != ISC_R_NOTFOUND) {
				goto tree_exit;
			}
		}

		active = false;
		if ((options & DNS_DBFIND_FORCENSEC3) == 0) {
			/*
			 * The NSEC3 tree won't have empty nodes,
			 * so it isn't necessary to check for them.
			 */
			dns_rbtnodechain_t chain = search.chain;
			active = activeempty(&search, &chain, name);
		}

		/*
		 * If we're here, then the name does not exist, is not
		 * beneath a zonecut, and there's no matching wildcard.
		 */
		if ((search.rbtversion->secure == dns_db_secure &&
		     !search.rbtversion->havensec3) ||
		    (search.options & DNS_DBFIND_FORCENSEC) != 0 ||
		    (search.options & DNS_DBFIND_FORCENSEC3) != 0)
		{
			result = find_closest_nsec(&search, nodep, foundname,
						   rdataset, sigrdataset, tree,
						   search.rbtversion->secure);
			if (result == ISC_R_SUCCESS) {
				result = active ? DNS_R_EMPTYNAME
						: DNS_R_NXDOMAIN;
			}
		} else {
			result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN;
		}
		goto tree_exit;
	} else if (result != ISC_R_SUCCESS) {
		goto tree_exit;
	}

found:
	/*
	 * We have found a node whose name is the desired name, or we
	 * have matched a wildcard.
	 */

	if (search.zonecut != NULL) {
		/*
		 * If we're beneath a zone cut, we don't want to look for
		 * CNAMEs because they're not legitimate zone glue.
		 */
		cname_ok = false;
	} else {
		/*
		 * The node may be a zone cut itself.  If it might be one,
		 * make sure we check for it later.
		 *
		 * DS records live above the zone cut in ordinary zone so
		 * we want to ignore any referral.
		 *
		 * Stub zones don't have anything "above" the delegation so
		 * we always return a referral.
		 */
		if (node->find_callback &&
		    ((node != search.rbtdb->origin_node &&
		      !dns_rdatatype_atparent(type)) ||
		     IS_STUB(search.rbtdb)))
		{
			maybe_zonecut = true;
		}
	}

	/*
	 * Certain DNSSEC types are not subject to CNAME matching
	 * (RFC4035, section 2.5 and RFC3007).
	 *
	 * We don't check for RRSIG, because we don't store RRSIG records
	 * directly.
	 */
	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
		cname_ok = false;
	}

	/*
	 * We now go looking for rdata...
	 */
	lock = &search.rbtdb->node_locks[node->locknum].lock;
	NODE_LOCK(lock, isc_rwlocktype_read);

	found = NULL;
	foundsig = NULL;
	sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
	nsecheader = NULL;
	nsecsig = NULL;
	cnamesig = NULL;
	empty_node = true;
	for (header = node->data; header != NULL; header = header_next) {
		header_next = header->next;
		/*
		 * Look for an active, extant rdataset visible at
		 * search.serial.
		 */
		do {
			if (header->serial <= search.serial &&
			    !IGNORE(header)) {
				/*
				 * Is this a "this rdataset doesn't
				 * exist" record?
				 */
				if (NONEXISTENT(header)) {
					header = NULL;
				}
				break;
			} else {
				header = header->down;
			}
		} while (header != NULL);
		if (header != NULL) {
			/*
			 * We now know that there is at least one active
			 * rdataset at this node.
			 */
			empty_node = false;

			/*
			 * Do special zone cut handling, if requested.
			 */
			if (maybe_zonecut &&
			    header->type == dns_rdatatype_ns) {
				/*
				 * We increment the reference count on node to
				 * ensure that search->zonecut_rdataset will
				 * still be valid later.
				 */
				new_reference(search.rbtdb, node,
					      isc_rwlocktype_read);
				search.zonecut = node;
				search.zonecut_rdataset = header;
				search.zonecut_sigrdataset = NULL;
				search.need_cleanup = true;
				maybe_zonecut = false;
				at_zonecut = true;
				/*
				 * It is not clear if KEY should still be
				 * allowed at the parent side of the zone
				 * cut or not.  It is needed for RFC3007
				 * validated updates.
				 */
				if ((search.options & DNS_DBFIND_GLUEOK) ==
					    0 &&
				    type != dns_rdatatype_nsec &&
				    type != dns_rdatatype_key)
				{
					/*
					 * Glue is not OK, but any answer we
					 * could return would be glue.  Return
					 * the delegation.
					 */
					found = NULL;
					break;
				}
				if (found != NULL && foundsig != NULL) {
					break;
				}
			}

			/*
			 * If the NSEC3 record doesn't match the chain
			 * we are using behave as if it isn't here.
			 */
			if (header->type == dns_rdatatype_nsec3 &&
			    !matchparams(header, &search)) {
				NODE_UNLOCK(lock, isc_rwlocktype_read);
				goto partial_match;
			}
			/*
			 * If we found a type we were looking for,
			 * remember it.
			 */
			if (header->type == type ||
			    type == dns_rdatatype_any ||
			    (header->type == dns_rdatatype_cname && cname_ok))
			{
				/*
				 * We've found the answer!
				 */
				found = header;
				if (header->type == dns_rdatatype_cname &&
				    cname_ok) {
					/*
					 * We may be finding a CNAME instead
					 * of the desired type.
					 *
					 * If we've already got the CNAME RRSIG,
					 * use it, otherwise change sigtype
					 * so that we find it.
					 */
					if (cnamesig != NULL) {
						foundsig = cnamesig;
					} else {
						sigtype =
							RBTDB_RDATATYPE_SIGCNAME;
					}
				}
				/*
				 * If we've got all we need, end the search.
				 */
				if (!maybe_zonecut && foundsig != NULL) {
					break;
				}
			} else if (header->type == sigtype) {
				/*
				 * We've found the RRSIG rdataset for our
				 * target type.  Remember it.
				 */
				foundsig = header;
				/*
				 * If we've got all we need, end the search.
				 */
				if (!maybe_zonecut && found != NULL) {
					break;
				}
			} else if (header->type == dns_rdatatype_nsec &&
				   !search.rbtversion->havensec3) {
				/*
				 * Remember a NSEC rdataset even if we're
				 * not specifically looking for it, because
				 * we might need it later.
				 */
				nsecheader = header;
			} else if (header->type == RBTDB_RDATATYPE_SIGNSEC &&
				   !search.rbtversion->havensec3) {
				/*
				 * If we need the NSEC rdataset, we'll also
				 * need its signature.
				 */
				nsecsig = header;
			} else if (cname_ok &&
				   header->type == RBTDB_RDATATYPE_SIGCNAME) {
				/*
				 * If we get a CNAME match, we'll also need
				 * its signature.
				 */
				cnamesig = header;
			}
		}
	}

	if (empty_node) {
		/*
		 * We have an exact match for the name, but there are no
		 * active rdatasets in the desired version.  That means that
		 * this node doesn't exist in the desired version, and that
		 * we really have a partial match.
		 */
		if (!wild) {
			NODE_UNLOCK(lock, isc_rwlocktype_read);
			goto partial_match;
		}
	}

	/*
	 * If we didn't find what we were looking for...
	 */
	if (found == NULL) {
		if (search.zonecut != NULL) {
			/*
			 * We were trying to find glue at a node beneath a
			 * zone cut, but didn't.
			 *
			 * Return the delegation.
			 */
			NODE_UNLOCK(lock, isc_rwlocktype_read);
			result = setup_delegation(&search, nodep, foundname,
						  rdataset, sigrdataset);
			goto tree_exit;
		}
		/*
		 * The desired type doesn't exist.
		 */
		result = DNS_R_NXRRSET;
		if (search.rbtversion->secure == dns_db_secure &&
		    !search.rbtversion->havensec3 &&
		    (nsecheader == NULL || nsecsig == NULL))
		{
			/*
			 * The zone is secure but there's no NSEC,
			 * or the NSEC has no signature!
			 */
			if (!wild) {
				result = DNS_R_BADDB;
				goto node_exit;
			}

			NODE_UNLOCK(lock, isc_rwlocktype_read);
			result = find_closest_nsec(&search, nodep, foundname,
						   rdataset, sigrdataset,
						   search.rbtdb->tree,
						   search.rbtversion->secure);
			if (result == ISC_R_SUCCESS) {
				result = DNS_R_EMPTYWILD;
			}
			goto tree_exit;
		}
		if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
		    nsecheader == NULL)
		{
			/*
			 * There's no NSEC record, and we were told
			 * to find one.
			 */
			result = DNS_R_BADDB;
			goto node_exit;
		}
		if (nodep != NULL) {
			new_reference(search.rbtdb, node,
				      isc_rwlocktype_read);
			*nodep = node;
		}
		if ((search.rbtversion->secure == dns_db_secure &&
		     !search.rbtversion->havensec3) ||
		    (search.options & DNS_DBFIND_FORCENSEC) != 0)
		{
			bind_rdataset(search.rbtdb, node, nsecheader, 0,
				      isc_rwlocktype_read, rdataset);
			if (nsecsig != NULL) {
				bind_rdataset(search.rbtdb, node, nsecsig, 0,
					      isc_rwlocktype_read,
					      sigrdataset);
			}
		}
		if (wild) {
			foundname->attributes |= DNS_NAMEATTR_WILDCARD;
		}
		goto node_exit;
	}

	/*
	 * We found what we were looking for, or we found a CNAME.
	 */
	if (type != found->type && type != dns_rdatatype_any &&
	    found->type == dns_rdatatype_cname)
	{
		/*
		 * We weren't doing an ANY query and we found a CNAME instead
		 * of the type we were looking for, so we need to indicate
		 * that result to the caller.
		 */
		result = DNS_R_CNAME;
	} else if (search.zonecut != NULL) {
		/*
		 * If we're beneath a zone cut, we must indicate that the
		 * result is glue, unless we're actually at the zone cut
		 * and the type is NSEC or KEY.
		 */
		if (search.zonecut == node) {
			/*
			 * It is not clear if KEY should still be
			 * allowed at the parent side of the zone
			 * cut or not.  It is needed for RFC3007
			 * validated updates.
			 */
			if (type == dns_rdatatype_nsec ||
			    type == dns_rdatatype_nsec3 ||
			    type == dns_rdatatype_key)
			{
				result = ISC_R_SUCCESS;
			} else if (type == dns_rdatatype_any) {
				result = DNS_R_ZONECUT;
			} else {
				result = DNS_R_GLUE;
			}
		} else {
			result = DNS_R_GLUE;
		}
		/*
		 * We might have found data that isn't glue, but was occluded
		 * by a dynamic update.  If the caller cares about this, they
		 * will have told us to validate glue.
		 *
		 * XXX We should cache the glue validity state!
		 */
		if (result == DNS_R_GLUE &&
		    (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
		    !valid_glue(&search, foundname, type, node))
		{
			NODE_UNLOCK(lock, isc_rwlocktype_read);
			result = setup_delegation(&search, nodep, foundname,
						  rdataset, sigrdataset);
			goto tree_exit;
		}
	} else {
		/*
		 * An ordinary successful query!
		 */
		result = ISC_R_SUCCESS;
	}

	if (nodep != NULL) {
		if (!at_zonecut) {
			new_reference(search.rbtdb, node,
				      isc_rwlocktype_read);
		} else {
			/*
			 * At the zone cut the reference was already taken
			 * during zone cut handling above; hand it to the
			 * caller instead of cleaning it up.
			 */
			search.need_cleanup = false;
		}
		*nodep = node;
	}

	if (type != dns_rdatatype_any) {
		bind_rdataset(search.rbtdb, node, found, 0,
			      isc_rwlocktype_read, rdataset);
		if (foundsig != NULL) {
			bind_rdataset(search.rbtdb, node, foundsig, 0,
				      isc_rwlocktype_read, sigrdataset);
		}
	}

	if (wild) {
		foundname->attributes |= DNS_NAMEATTR_WILDCARD;
	}

node_exit:
	NODE_UNLOCK(lock, isc_rwlocktype_read);

tree_exit:
	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * If we found a zonecut but aren't going to use it, we have to
	 * let go of it.
	 */
	if (search.need_cleanup) {
		node = search.zonecut;
		INSIST(node != NULL);
		lock = &(search.rbtdb->node_locks[node->locknum].lock);

		NODE_LOCK(lock, isc_rwlocktype_read);
		decrement_reference(search.rbtdb, node, 0,
				    isc_rwlocktype_read, isc_rwlocktype_none,
				    false);
		NODE_UNLOCK(lock, isc_rwlocktype_read);
	}

	if (close_version) {
		closeversion(db, &version, false);
	}

	dns_rbtnodechain_reset(&search.chain);

	return (result);
}

/*
 * dns_db_findzonecut() entry point for zone databases.  Finding a zone
 * cut this way is a cache-only operation; calling it on a zone database
 * is a fatal programming error, hence the unconditional abort.
 */
static isc_result_t
zone_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
		 isc_stdtime_t now, dns_dbnode_t **nodep,
		 dns_name_t *foundname, dns_name_t *dcname,
		 dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
	UNUSED(db);
	UNUSED(name);
	UNUSED(options);
	UNUSED(now);
	UNUSED(nodep);
	UNUSED(foundname);
	UNUSED(dcname);
	UNUSED(rdataset);
	UNUSED(sigrdataset);

	FATAL_ERROR("zone_findzonecut() called!");
	UNREACHABLE();
	return (ISC_R_NOTIMPLEMENTED);
}

/*
 * Check whether 'header' has expired relative to search->now and decide
 * its fate: serve it stale, skip it, or clean it up.
 *
 * Returns true when the caller should skip this header, false when the
 * header may be used.  May upgrade *locktype from read to write when
 * cleaning is possible.  *header_prev tracks the most recently retained
 * header so a freed header can be unlinked from node->data.
 */
static bool
check_stale_header(dns_rbtnode_t *node, rdatasetheader_t *header,
		   isc_rwlocktype_t *locktype, nodelock_t *lock,
		   rbtdb_search_t *search, rdatasetheader_t **header_prev) {
	if (!ACTIVE(header, search->now)) {
		dns_ttl_t stale = header->rdh_ttl +
				  STALE_TTL(header, search->rbtdb);
		/*
		 * If this data is in the stale window keep it and if
		 * DNS_DBFIND_STALEOK is not set we tell the caller to
		 * skip this record.  We skip the records with ZEROTTL
		 * (these records should not be cached anyway).
		 */

		RDATASET_ATTR_CLR(header, RDATASET_ATTR_STALE_WINDOW);
		if (!ZEROTTL(header) && KEEPSTALE(search->rbtdb) &&
		    stale > search->now)
		{
			mark_header_stale(search->rbtdb, header);
			*header_prev = header;
			/*
			 * If DNS_DBFIND_STALESTART is set then it means we
			 * failed to resolve the name during recursion, in
			 * this case we mark the time in which the refresh
			 * failed.
			 */
			if ((search->options & DNS_DBFIND_STALESTART) != 0) {
				atomic_store_release(
					&header->last_refresh_fail_ts,
					search->now);
			} else if ((search->options &
				    DNS_DBFIND_STALEENABLED) != 0 &&
				   search->now <
					   (atomic_load_acquire(
						    &header->last_refresh_fail_ts) +
					    search->rbtdb->serve_stale_refresh))
			{
				/*
				 * If we are within interval between last
				 * refresh failure time + 'stale-refresh-time',
				 * then don't skip this stale entry but use it
				 * instead.
				 */
				RDATASET_ATTR_SET(header,
						  RDATASET_ATTR_STALE_WINDOW);
				return (false);
			} else if ((search->options &
				    DNS_DBFIND_STALETIMEOUT) != 0)
			{
				/*
				 * We want stale RRset due to timeout, so we
				 * don't skip it.
				 */
				return (false);
			}
			return ((search->options & DNS_DBFIND_STALEOK) == 0);
		}

		/*
		 * This rdataset is stale.  If no one else is using the
		 * node, we can clean it up right now, otherwise we mark
		 * it as ancient, and the node as dirty, so it will get
		 * cleaned up later.
		 */
		if ((header->rdh_ttl < search->now - RBTDB_VIRTUAL) &&
		    (*locktype == isc_rwlocktype_write ||
		     NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS))
		{
			/*
			 * We update the node's status only when we can
			 * get write access; otherwise, we leave others
			 * to this work.  Periodical cleaning will
			 * eventually take the job as the last resort.
			 * We won't downgrade the lock, since other
			 * rdatasets are probably stale, too.
			 */
			*locktype = isc_rwlocktype_write;

			if (isc_refcount_current(&node->references) == 0) {
				isc_mem_t *mctx;

				/*
				 * header->down can be non-NULL if the
				 * refcount has just decremented to 0
				 * but decrement_reference() has not
				 * performed clean_cache_node(), in
				 * which case we need to purge the stale
				 * headers first.
				 */
				mctx = search->rbtdb->common.mctx;
				clean_stale_headers(search->rbtdb, mctx,
						    header);
				/* Unlink the header before freeing it. */
				if (*header_prev != NULL) {
					(*header_prev)->next = header->next;
				} else {
					node->data = header->next;
				}
				free_rdataset(search->rbtdb, mctx, header);
			} else {
				mark_header_ancient(search->rbtdb, header);
				*header_prev = header;
			}
		} else {
			*header_prev = header;
		}
		return (true);
	}
	return (false);
}

/*
 * dns_rbt_findnode() callback for cache searches: at each callback node,
 * look for an active DNAME (and its RRSIG).  If a usable one is found,
 * record the node as the zone cut in 'search' and stop the descent with
 * DNS_R_PARTIALMATCH; otherwise continue with DNS_R_CONTINUE.
 */
static isc_result_t
cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
	rbtdb_search_t *search = arg;
	rdatasetheader_t *header, *header_prev, *header_next;
	rdatasetheader_t *dname_header, *sigdname_header;
	isc_result_t result;
	nodelock_t *lock;
	isc_rwlocktype_t locktype;

	/* XXX comment */

	REQUIRE(search->zonecut == NULL);

	/*
	 * Keep compiler silent.
	 */
	UNUSED(name);

	lock = &(search->rbtdb->node_locks[node->locknum].lock);
	locktype = isc_rwlocktype_read;
	NODE_LOCK(lock, locktype);

	/*
	 * Look for a DNAME or RRSIG DNAME rdataset.
	 */
	dname_header = NULL;
	sigdname_header = NULL;
	header_prev = NULL;
	for (header = node->data; header != NULL; header = header_next) {
		header_next = header->next;
		if (check_stale_header(node, header, &locktype, lock, search,
				       &header_prev)) {
			/* Do nothing. */
		} else if (header->type == dns_rdatatype_dname &&
			   EXISTS(header) && !ANCIENT(header)) {
			dname_header = header;
			header_prev = header;
		} else if (header->type == RBTDB_RDATATYPE_SIGDNAME &&
			   EXISTS(header) && !ANCIENT(header)) {
			sigdname_header = header;
			header_prev = header;
		} else {
			header_prev = header;
		}
	}

	/*
	 * Pending data is only usable when the caller explicitly allowed
	 * it with DNS_DBFIND_PENDINGOK.
	 */
	if (dname_header != NULL &&
	    (!DNS_TRUST_PENDING(dname_header->trust) ||
	     (search->options & DNS_DBFIND_PENDINGOK) != 0))
	{
		/*
		 * We increment the reference count on node to ensure that
		 * search->zonecut_rdataset will still be valid later.
		 */
		new_reference(search->rbtdb, node, locktype);
		search->zonecut = node;
		search->zonecut_rdataset = dname_header;
		search->zonecut_sigrdataset = sigdname_header;
		search->need_cleanup = true;
		result = DNS_R_PARTIALMATCH;
	} else {
		result = DNS_R_CONTINUE;
	}

	NODE_UNLOCK(lock, locktype);

	return (result);
}

/*
 * Starting at 'node', walk back up the levels recorded in the search
 * chain looking for the deepest node that has an active NS rdataset,
 * i.e. the deepest known zone cut.  On success returns DNS_R_DELEGATION
 * with foundname/nodep and the NS (and RRSIG NS) rdatasets bound;
 * otherwise ISC_R_NOTFOUND (or a dns_name_concatenate() failure code).
 */
static isc_result_t
find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
		     dns_dbnode_t **nodep, dns_name_t *foundname,
		     dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
	unsigned int i;
	dns_rbtnode_t *level_node;
	rdatasetheader_t *header, *header_prev, *header_next;
	rdatasetheader_t *found, *foundsig;
	isc_result_t result = ISC_R_NOTFOUND;
	dns_name_t name;
	dns_rbtdb_t *rbtdb;
	bool done;
	nodelock_t *lock;
	isc_rwlocktype_t locktype;

	/*
	 * Caller must be holding the tree lock.
	 */

	rbtdb = search->rbtdb;
	i = search->chain.level_matches;
	done = false;
	do {
		locktype = isc_rwlocktype_read;
		lock = &rbtdb->node_locks[node->locknum].lock;
		NODE_LOCK(lock, locktype);

		/*
		 * Look for NS and RRSIG NS rdatasets.
		 */
		found = NULL;
		foundsig = NULL;
		header_prev = NULL;
		for (header = node->data; header != NULL;
		     header = header_next) {
			header_next = header->next;
			if (check_stale_header(node, header, &locktype, lock,
					       search, &header_prev)) {
				/* Do nothing. */
			} else if (EXISTS(header) && !ANCIENT(header)) {
				/*
				 * We've found an extant rdataset.  See if
				 * we're interested in it.
				 */
				if (header->type == dns_rdatatype_ns) {
					found = header;
					if (foundsig != NULL) {
						break;
					}
				} else if (header->type ==
					   RBTDB_RDATATYPE_SIGNS) {
					foundsig = header;
					if (found != NULL) {
						break;
					}
				}
				header_prev = header;
			} else {
				header_prev = header;
			}
		}

		if (found != NULL) {
			/*
			 * If we have to set foundname, we do it before
			 * anything else.  If we were to set foundname after
			 * we had set nodep or bound the rdataset, then we'd
			 * have to undo that work if dns_name_concatenate()
			 * failed.  By setting foundname first, there's
			 * nothing to undo if we have trouble.
			 */
			if (foundname != NULL) {
				dns_name_init(&name, NULL);
				dns_rbt_namefromnode(node, &name);
				dns_name_copy(&name, foundname);
				while (i > 0) {
					i--;
					level_node = search->chain.levels[i];
					dns_name_init(&name, NULL);
					dns_rbt_namefromnode(level_node,
							     &name);
					result = dns_name_concatenate(
						foundname, &name, foundname,
						NULL);
					if (result != ISC_R_SUCCESS) {
						if (nodep != NULL) {
							*nodep = NULL;
						}
						goto node_exit;
					}
				}
			}
			result = DNS_R_DELEGATION;
			if (nodep != NULL) {
				new_reference(search->rbtdb, node, locktype);
				*nodep = node;
			}
			bind_rdataset(search->rbtdb, node, found, search->now,
				      locktype, rdataset);
			if (foundsig != NULL) {
				bind_rdataset(search->rbtdb, node, foundsig,
					      search->now, locktype,
					      sigrdataset);
			}
			/*
			 * LRU bookkeeping needs the write lock; upgrade
			 * only when an update is actually required.
			 */
			if (need_headerupdate(found, search->now) ||
			    (foundsig != NULL &&
			     need_headerupdate(foundsig, search->now)))
			{
				if (locktype != isc_rwlocktype_write) {
					NODE_UNLOCK(lock, locktype);
					NODE_LOCK(lock,
						  isc_rwlocktype_write);
					locktype = isc_rwlocktype_write;
					POST(locktype);
				}
				if (need_headerupdate(found, search->now)) {
					update_header(search->rbtdb, found,
						      search->now);
				}
				if (foundsig != NULL &&
				    need_headerupdate(foundsig, search->now))
				{
					update_header(search->rbtdb, foundsig,
						      search->now);
				}
			}
		}

	node_exit:
		NODE_UNLOCK(lock, locktype);

		if (found == NULL && i > 0) {
			i--;
			node = search->chain.levels[i];
		} else {
			done = true;
		}
	} while (!done);

	return (result);
}

/*
 * Look for a potentially covering NSEC in the cache where `name`
 * is known not to exist.  This uses the auxiliary NSEC tree to find
 * the potential NSEC owner.  If found, we update 'foundname', 'nodep',
 * 'rdataset' and 'sigrdataset', and return DNS_R_COVERINGNSEC.
 * Otherwise, return ISC_R_NOTFOUND.
 */
static isc_result_t
find_coveringnsec(rbtdb_search_t *search, const dns_name_t *name,
		  dns_dbnode_t **nodep, isc_stdtime_t now,
		  dns_name_t *foundname, dns_rdataset_t *rdataset,
		  dns_rdataset_t *sigrdataset) {
	dns_fixedname_t fprefix, forigin, ftarget, fixed;
	dns_name_t *prefix = NULL, *origin = NULL;
	dns_name_t *target = NULL, *fname = NULL;
	dns_rbtnode_t *node = NULL;
	dns_rbtnodechain_t chain;
	isc_result_t result;
	isc_rwlocktype_t locktype;
	nodelock_t *lock = NULL;
	rbtdb_rdatatype_t matchtype, sigmatchtype;
	rdatasetheader_t *found = NULL, *foundsig = NULL;
	rdatasetheader_t *header = NULL;
	rdatasetheader_t *header_next = NULL, *header_prev = NULL;

	/*
	 * Look for the node in the auxiliary tree.  A partial match means
	 * the chain is positioned at the name's predecessor.
	 */
	dns_rbtnodechain_init(&chain);
	target = dns_fixedname_initname(&ftarget);
	result = dns_rbt_findnode(search->rbtdb->nsec, name, target, &node,
				  &chain, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
	if (result != DNS_R_PARTIALMATCH) {
		dns_rbtnodechain_reset(&chain);
		return (ISC_R_NOTFOUND);
	}

	prefix = dns_fixedname_initname(&fprefix);
	origin = dns_fixedname_initname(&forigin);
	/* Re-initialize: findnode above may have written into 'target'. */
	target = dns_fixedname_initname(&ftarget);
	fname = dns_fixedname_initname(&fixed);

	locktype = isc_rwlocktype_read;

	matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0);
	sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig,
					     dns_rdatatype_nsec);

	/*
	 * Extract predecessor from chain.
	 */
	result = dns_rbtnodechain_current(&chain, prefix, origin, NULL);
	dns_rbtnodechain_reset(&chain);
	if (result != ISC_R_SUCCESS && result != DNS_R_NEWORIGIN) {
		return (ISC_R_NOTFOUND);
	}

	result = dns_name_concatenate(prefix, origin, target, NULL);
	if (result != ISC_R_SUCCESS) {
		return (ISC_R_NOTFOUND);
	}

	/*
	 * Lookup the predecessor in the main tree.
	 */
	node = NULL;
	result = dns_rbt_findnode(search->rbtdb->tree, target, fname, &node,
				  NULL, DNS_RBTFIND_EMPTYDATA, NULL, NULL);
	if (result != ISC_R_SUCCESS) {
		return (ISC_R_NOTFOUND);
	}

	lock = &(search->rbtdb->node_locks[node->locknum].lock);
	NODE_LOCK(lock, locktype);
	for (header = node->data; header != NULL; header = header_next) {
		header_next = header->next;
		if (check_stale_header(node, header, &locktype, lock, search,
				       &header_prev)) {
			continue;
		}
		/* Skip nonexistence markers and negative-cache entries. */
		if (NONEXISTENT(header) ||
		    RBTDB_RDATATYPE_BASE(header->type) == 0) {
			header_prev = header;
			continue;
		}
		if (header->type == matchtype) {
			found = header;
			if (foundsig != NULL) {
				break;
			}
		} else if (header->type == sigmatchtype) {
			foundsig = header;
			if (found != NULL) {
				break;
			}
		}
		header_prev = header;
	}
	if (found != NULL) {
		bind_rdataset(search->rbtdb, node, found, now, locktype,
			      rdataset);
		if (foundsig != NULL) {
			bind_rdataset(search->rbtdb, node, foundsig, now,
				      locktype, sigrdataset);
		}
		new_reference(search->rbtdb, node, locktype);

		dns_name_copy(fname, foundname);

		*nodep = node;
		result = DNS_R_COVERINGNSEC;
	} else {
		result = ISC_R_NOTFOUND;
	}
	NODE_UNLOCK(lock, locktype);
	return (result);
}

/*
 * dns_db_find() implementation for cache databases: look up 'name'/'type'
 * as of time 'now' (0 means "current time"), handling DNAME zone cuts,
 * covering NSECs, negative cache entries, CNAMEs, serve-stale, and
 * delegation fallback to the deepest known zone cut.
 */
static isc_result_t
cache_find(dns_db_t *db, const dns_name_t *name, dns_dbversion_t *version,
	   dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
	   dns_dbnode_t **nodep, dns_name_t *foundname,
	   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
	dns_rbtnode_t *node = NULL;
	isc_result_t result;
	rbtdb_search_t search;
	bool cname_ok = true;
	bool found_noqname = false;
	bool all_negative = true;
	bool empty_node;
	nodelock_t *lock;
	isc_rwlocktype_t locktype;
	rdatasetheader_t *header, *header_prev, *header_next;
	rdatasetheader_t *found, *nsheader;
	rdatasetheader_t *foundsig, *nssig, *cnamesig;
	rdatasetheader_t *update, *updatesig;
	rdatasetheader_t *nsecheader, *nsecsig;
	rbtdb_rdatatype_t sigtype, negtype;

	UNUSED(version);

	search.rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(search.rbtdb));
	REQUIRE(version == NULL);

	if (now ==
0) {
		isc_stdtime_get(&now);
	}

	search.rbtversion = NULL;
	search.serial = 1;
	search.options = options;
	search.copy_name = false;
	search.need_cleanup = false;
	search.wild = false;
	search.zonecut = NULL;
	search.zonecut_rdataset = NULL;
	search.zonecut_sigrdataset = NULL;
	dns_fixedname_init(&search.zonecut_name);
	dns_rbtnodechain_init(&search.chain);
	search.now = now;
	update = NULL;
	updatesig = NULL;

	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * Search down from the root of the tree.  If, while going down, we
	 * encounter a callback node, cache_zonecut_callback() will search the
	 * rdatasets at the zone cut for a DNAME rdataset.
	 */
	result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
				  &search.chain, DNS_RBTFIND_EMPTYDATA,
				  cache_zonecut_callback, &search);

	if (result == DNS_R_PARTIALMATCH) {
		/*
		 * If dns_rbt_findnode discovered a covering DNAME skip
		 * looking for a covering NSEC.
		 */
		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0 &&
		    (search.zonecut_rdataset == NULL ||
		     search.zonecut_rdataset->type != dns_rdatatype_dname))
		{
			result = find_coveringnsec(&search, name, nodep, now,
						   foundname, rdataset,
						   sigrdataset);
			if (result == DNS_R_COVERINGNSEC) {
				goto tree_exit;
			}
		}
		if (search.zonecut != NULL) {
			result = setup_delegation(&search, nodep, foundname,
						  rdataset, sigrdataset);
			goto tree_exit;
		} else {
		find_ns:
			result = find_deepest_zonecut(&search, node, nodep,
						      foundname, rdataset,
						      sigrdataset);
			goto tree_exit;
		}
	} else if (result != ISC_R_SUCCESS) {
		goto tree_exit;
	}

	/*
	 * Certain DNSSEC types are not subject to CNAME matching
	 * (RFC4035, section 2.5 and RFC3007).
	 *
	 * We don't check for RRSIG, because we don't store RRSIG records
	 * directly.
	 */
	if (type == dns_rdatatype_key || type == dns_rdatatype_nsec) {
		cname_ok = false;
	}

	/*
	 * We now go looking for rdata...
	 */
	lock = &(search.rbtdb->node_locks[node->locknum].lock);
	locktype = isc_rwlocktype_read;
	NODE_LOCK(lock, locktype);

	found = NULL;
	foundsig = NULL;
	sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
	negtype = RBTDB_RDATATYPE_VALUE(0, type);
	nsheader = NULL;
	nsecheader = NULL;
	nssig = NULL;
	nsecsig = NULL;
	cnamesig = NULL;
	empty_node = true;
	header_prev = NULL;
	for (header = node->data; header != NULL; header = header_next) {
		header_next = header->next;
		if (check_stale_header(node, header, &locktype, lock, &search,
				       &header_prev)) {
			/* Do nothing. */
		} else if (EXISTS(header) && !ANCIENT(header)) {
			/*
			 * We now know that there is at least one active
			 * non-stale rdataset at this node.
			 */
			empty_node = false;
			if (header->noqname != NULL &&
			    header->trust == dns_trust_secure) {
				found_noqname = true;
			}
			if (!NEGATIVE(header)) {
				all_negative = false;
			}

			/*
			 * If we found a type we were looking for, remember
			 * it.
			 */
			if (header->type == type ||
			    (type == dns_rdatatype_any &&
			     RBTDB_RDATATYPE_BASE(header->type) != 0) ||
			    (cname_ok &&
			     header->type == dns_rdatatype_cname))
			{
				/*
				 * We've found the answer.
				 */
				found = header;
				if (header->type == dns_rdatatype_cname &&
				    cname_ok && cnamesig != NULL)
				{
					/*
					 * If we've already got the
					 * CNAME RRSIG, use it.
					 */
					foundsig = cnamesig;
				}
			} else if (header->type == sigtype) {
				/*
				 * We've found the RRSIG rdataset for our
				 * target type.  Remember it.
				 */
				foundsig = header;
			} else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
				   header->type == negtype)
			{
				/*
				 * We've found a negative cache entry.
				 */
				found = header;
			} else if (header->type == dns_rdatatype_ns) {
				/*
				 * Remember a NS rdataset even if we're
				 * not specifically looking for it, because
				 * we might need it later.
				 */
				nsheader = header;
			} else if (header->type == RBTDB_RDATATYPE_SIGNS) {
				/*
				 * If we need the NS rdataset, we'll also
				 * need its signature.
				 */
				nssig = header;
			} else if (header->type == dns_rdatatype_nsec) {
				nsecheader = header;
			} else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
				nsecsig = header;
			} else if (cname_ok &&
				   header->type == RBTDB_RDATATYPE_SIGCNAME) {
				/*
				 * If we get a CNAME match, we'll also need
				 * its signature.
				 */
				cnamesig = header;
			}
			header_prev = header;
		} else {
			header_prev = header;
		}
	}

	if (empty_node) {
		/*
		 * We have an exact match for the name, but there are no
		 * extant rdatasets.  That means that this node doesn't
		 * meaningfully exist, and that we really have a partial
		 * match.
		 */
		NODE_UNLOCK(lock, locktype);
		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0) {
			result = find_coveringnsec(&search, name, nodep, now,
						   foundname, rdataset,
						   sigrdataset);
			if (result == DNS_R_COVERINGNSEC) {
				goto tree_exit;
			}
		}
		goto find_ns;
	}

	/*
	 * If we didn't find what we were looking for (or found something
	 * whose trust level the caller's options don't accept)...
	 */
	if (found == NULL ||
	    (DNS_TRUST_ADDITIONAL(found->trust) &&
	     ((options & DNS_DBFIND_ADDITIONALOK) == 0)) ||
	    (found->trust == dns_trust_glue &&
	     ((options & DNS_DBFIND_GLUEOK) == 0)) ||
	    (DNS_TRUST_PENDING(found->trust) &&
	     ((options & DNS_DBFIND_PENDINGOK) == 0)))
	{
		/*
		 * Return covering NODATA NSEC record.
		 */
		if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0 &&
		    nsecheader != NULL)
		{
			if (nodep != NULL) {
				new_reference(search.rbtdb, node, locktype);
				*nodep = node;
			}
			bind_rdataset(search.rbtdb, node, nsecheader,
				      search.now, locktype, rdataset);
			if (need_headerupdate(nsecheader, search.now)) {
				update = nsecheader;
			}
			if (nsecsig != NULL) {
				bind_rdataset(search.rbtdb, node, nsecsig,
					      search.now, locktype,
					      sigrdataset);
				if (need_headerupdate(nsecsig, search.now)) {
					updatesig = nsecsig;
				}
			}
			result = DNS_R_COVERINGNSEC;
			goto node_exit;
		}

		/*
		 * This name was from a wild card.  Look for a covering NSEC.
		 */
		if (found == NULL && (found_noqname || all_negative) &&
		    (search.options & DNS_DBFIND_COVERINGNSEC) != 0)
		{
			NODE_UNLOCK(lock, locktype);
			result = find_coveringnsec(&search, name, nodep, now,
						   foundname, rdataset,
						   sigrdataset);
			if (result == DNS_R_COVERINGNSEC) {
				goto tree_exit;
			}
			goto find_ns;
		}

		/*
		 * If there is an NS rdataset at this node, then this is the
		 * deepest zone cut.
		 */
		if (nsheader != NULL) {
			if (nodep != NULL) {
				new_reference(search.rbtdb, node, locktype);
				*nodep = node;
			}
			bind_rdataset(search.rbtdb, node, nsheader,
				      search.now, locktype, rdataset);
			if (need_headerupdate(nsheader, search.now)) {
				update = nsheader;
			}
			if (nssig != NULL) {
				bind_rdataset(search.rbtdb, node, nssig,
					      search.now, locktype,
					      sigrdataset);
				if (need_headerupdate(nssig, search.now)) {
					updatesig = nssig;
				}
			}
			result = DNS_R_DELEGATION;
			goto node_exit;
		}

		/*
		 * Go find the deepest zone cut.
		 */
		NODE_UNLOCK(lock, locktype);
		goto find_ns;
	}

	/*
	 * We found what we were looking for, or we found a CNAME.
	 */

	if (nodep != NULL) {
		new_reference(search.rbtdb, node, locktype);
		*nodep = node;
	}

	if (NEGATIVE(found)) {
		/*
		 * We found a negative cache entry.
		 */
		if (NXDOMAIN(found)) {
			result = DNS_R_NCACHENXDOMAIN;
		} else {
			result = DNS_R_NCACHENXRRSET;
		}
	} else if (type != found->type && type != dns_rdatatype_any &&
		   found->type == dns_rdatatype_cname)
	{
		/*
		 * We weren't doing an ANY query and we found a CNAME instead
		 * of the type we were looking for, so we need to indicate
		 * that result to the caller.
		 */
		result = DNS_R_CNAME;
	} else {
		/*
		 * An ordinary successful query!
		 */
		result = ISC_R_SUCCESS;
	}

	if (type != dns_rdatatype_any || result == DNS_R_NCACHENXDOMAIN ||
	    result == DNS_R_NCACHENXRRSET)
	{
		bind_rdataset(search.rbtdb, node, found, search.now, locktype,
			      rdataset);
		if (need_headerupdate(found, search.now)) {
			update = found;
		}
		if (!NEGATIVE(found) && foundsig != NULL) {
			bind_rdataset(search.rbtdb, node, foundsig, search.now,
				      locktype, sigrdataset);
			if (need_headerupdate(foundsig, search.now)) {
				updatesig = foundsig;
			}
		}
	}

node_exit:
	/* LRU updates need the write lock; upgrade only when necessary. */
	if ((update != NULL || updatesig != NULL) &&
	    locktype != isc_rwlocktype_write)
	{
		NODE_UNLOCK(lock, locktype);
		NODE_LOCK(lock, isc_rwlocktype_write);
		locktype = isc_rwlocktype_write;
		POST(locktype);
	}
	if (update != NULL && need_headerupdate(update, search.now)) {
		update_header(search.rbtdb, update, search.now);
	}
	if (updatesig != NULL && need_headerupdate(updatesig, search.now)) {
		update_header(search.rbtdb, updatesig, search.now);
	}

	NODE_UNLOCK(lock, locktype);

tree_exit:
	RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * If we found a zonecut but aren't going to use it, we have to
	 * let go of it.
	 */
	if (search.need_cleanup) {
		node = search.zonecut;
		INSIST(node != NULL);
		lock = &(search.rbtdb->node_locks[node->locknum].lock);

		NODE_LOCK(lock, isc_rwlocktype_read);
		decrement_reference(search.rbtdb, node, 0,
				    isc_rwlocktype_read, isc_rwlocktype_none,
				    false);
		NODE_UNLOCK(lock, isc_rwlocktype_read);
	}

	dns_rbtnodechain_reset(&search.chain);

	update_cachestats(search.rbtdb, result);
	return (result);
}

/*
 * dns_db_findzonecut() implementation for cache databases: find the
 * deepest known zone cut (NS rdataset) at or above 'name' as of 'now'.
 * 'dcname', when non-NULL, receives the name of the deepest matching
 * node found during the descent.
 */
static isc_result_t
cache_findzonecut(dns_db_t *db, const dns_name_t *name, unsigned int options,
		  isc_stdtime_t now, dns_dbnode_t **nodep,
		  dns_name_t *foundname, dns_name_t *dcname,
		  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) {
	dns_rbtnode_t *node = NULL;
	nodelock_t *lock;
	isc_result_t result;
	rbtdb_search_t search;
	rdatasetheader_t *header, *header_prev, *header_next;
	rdatasetheader_t *found, *foundsig;
	unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA;
	isc_rwlocktype_t locktype;
	bool dcnull = (dcname == NULL);

	search.rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(search.rbtdb));

	if (now == 0) {
		isc_stdtime_get(&now);
	}

	search.rbtversion = NULL;
	search.serial = 1;
	search.options = options;
	search.copy_name = false;
	search.need_cleanup = false;
	search.wild = false;
	search.zonecut = NULL;
	dns_fixedname_init(&search.zonecut_name);
	dns_rbtnodechain_init(&search.chain);
	search.now = now;

	if (dcnull) {
		dcname = foundname;
	}

	if ((options & DNS_DBFIND_NOEXACT) != 0) {
		rbtoptions |= DNS_RBTFIND_NOEXACT;
	}

	RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * Search down from the root of the tree.
	 */
	result = dns_rbt_findnode(search.rbtdb->tree, name, dcname, &node,
				  &search.chain, rbtoptions, NULL, &search);

	if (result == DNS_R_PARTIALMATCH) {
		result = find_deepest_zonecut(&search, node, nodep, foundname,
					      rdataset, sigrdataset);
		goto tree_exit;
	} else if (result != ISC_R_SUCCESS) {
		goto tree_exit;
	} else if (!dcnull) {
		dns_name_copy(dcname, foundname);
	}

	/*
	 * We now go looking for an NS rdataset at the node.
*/ lock = &(search.rbtdb->node_locks[node->locknum].lock); locktype = isc_rwlocktype_read; NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; header_prev = NULL; for (header = node->data; header != NULL; header = header_next) { header_next = header->next; if (check_stale_header(node, header, &locktype, lock, &search, &header_prev)) { /* * The function dns_rbt_findnode found us the a matching * node for 'name' and stored the result in 'dcname'. * This is the deepest known zonecut in our database. * However, this node may be stale and if serve-stale * is not enabled (in other words 'stale-answer-enable' * is set to no), this node may not be used as a * zonecut we know about. If so, find the deepest * zonecut from this node up and return that instead. */ NODE_UNLOCK(lock, locktype); result = find_deepest_zonecut(&search, node, nodep, foundname, rdataset, sigrdataset); dns_name_copy(foundname, dcname); goto tree_exit; } else if (EXISTS(header) && !ANCIENT(header)) { /* * If we found a type we were looking for, remember * it. */ if (header->type == dns_rdatatype_ns) { /* * Remember a NS rdataset even if we're * not specifically looking for it, because * we might need it later. */ found = header; } else if (header->type == RBTDB_RDATATYPE_SIGNS) { /* * If we need the NS rdataset, we'll also * need its signature. */ foundsig = header; } header_prev = header; } else { header_prev = header; } } if (found == NULL) { /* * No NS records here. 
*/ NODE_UNLOCK(lock, locktype); result = find_deepest_zonecut(&search, node, nodep, foundname, rdataset, sigrdataset); goto tree_exit; } if (nodep != NULL) { new_reference(search.rbtdb, node, locktype); *nodep = node; } bind_rdataset(search.rbtdb, node, found, search.now, locktype, rdataset); if (foundsig != NULL) { bind_rdataset(search.rbtdb, node, foundsig, search.now, locktype, sigrdataset); } if (need_headerupdate(found, search.now) || (foundsig != NULL && need_headerupdate(foundsig, search.now))) { if (locktype != isc_rwlocktype_write) { NODE_UNLOCK(lock, locktype); NODE_LOCK(lock, isc_rwlocktype_write); locktype = isc_rwlocktype_write; POST(locktype); } if (need_headerupdate(found, search.now)) { update_header(search.rbtdb, found, search.now); } if (foundsig != NULL && need_headerupdate(foundsig, search.now)) { update_header(search.rbtdb, foundsig, search.now); } } NODE_UNLOCK(lock, locktype); tree_exit: RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read); INSIST(!search.need_cleanup); dns_rbtnodechain_reset(&search.chain); if (result == DNS_R_DELEGATION) { result = ISC_R_SUCCESS; } return (result); } static void attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *node = (dns_rbtnode_t *)source; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(targetp != NULL && *targetp == NULL); isc_refcount_increment(&node->references); *targetp = source; } static void detachnode(dns_db_t *db, dns_dbnode_t **targetp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *node; bool want_free = false; bool inactive = false; rbtdb_nodelock_t *nodelock; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(targetp != NULL && *targetp != NULL); node = (dns_rbtnode_t *)(*targetp); nodelock = &rbtdb->node_locks[node->locknum]; NODE_LOCK(&nodelock->lock, isc_rwlocktype_read); if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, isc_rwlocktype_none, false)) { if (isc_refcount_current(&nodelock->references) == 0 && 
nodelock->exiting) {
			inactive = true;
		}
	}

	NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);

	*targetp = NULL;

	if (inactive) {
		/*
		 * This bucket's last node reference is gone and the lock
		 * bucket is marked exiting; account for it and free the
		 * whole database once no active buckets remain.
		 */
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
		rbtdb->active--;
		if (rbtdb->active == 0) {
			want_free = true;
		}
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
		if (want_free) {
			char buf[DNS_NAME_FORMATSIZE];
			if (dns_name_dynamic(&rbtdb->common.origin)) {
				dns_name_format(&rbtdb->common.origin, buf,
						sizeof(buf));
			} else {
				/*
				 * NOTE(review): empty fallback string here;
				 * upstream BIND uses a "<UNKNOWN>" style
				 * placeholder -- confirm against the
				 * original source.
				 */
				strlcpy(buf, "", sizeof(buf));
			}
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
				      "calling free_rbtdb(%s)", buf);
			free_rbtdb(rbtdb, true, NULL);
		}
	}
}

/*
 * dns_db "expirenode" method: mark rdatasets at 'node' ancient once their
 * TTL (extended by any stale-serving window) has passed, and, when the
 * memory context is over its limit, force-expire leaf nodes with 25%
 * probability.
 */
static isc_result_t
expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *rbtnode = node;
	rdatasetheader_t *header;
	bool force_expire = false;
	/*
	 * These are the category and module used by the cache cleaner.
	 */
	bool log = false;
	isc_logcategory_t *category = DNS_LOGCATEGORY_DATABASE;
	isc_logmodule_t *module = DNS_LOGMODULE_CACHE;
	int level = ISC_LOG_DEBUG(2);
	char printname[DNS_NAME_FORMATSIZE];

	REQUIRE(VALID_RBTDB(rbtdb));

	/*
	 * Caller must hold a tree lock.
	 */

	if (now == 0) {
		isc_stdtime_get(&now);
	}

	if (isc_mem_isovermem(rbtdb->common.mctx)) {
		/*
		 * Force expire with 25% probability.
		 * XXXDCL Could stand to have a better policy, like LRU.
		 */
		force_expire = (rbtnode->down == NULL &&
				(isc_random32() % 4) == 0);

		/*
		 * Note that 'log' can be true IFF overmem is also true.
		 * overmem can currently only be true for cache
		 * databases -- hence all of the "overmem cache" log strings.
		 */
		log = isc_log_wouldlog(dns_lctx, level);
		if (log) {
			isc_log_write(
				dns_lctx, category, module, level,
				"overmem cache: %s %s",
				force_expire ? "FORCE" : "check",
				dns_rbt_formatnodename(rbtnode, printname,
						       sizeof(printname)));
		}
	}

	/*
	 * We may not need write access, but this code path is not performance
	 * sensitive, so it should be okay to always lock as a writer.
	 */
	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_write);
	for (header = rbtnode->data; header != NULL; header = header->next) {
		if (header->rdh_ttl + STALE_TTL(header, rbtdb) <=
		    now - RBTDB_VIRTUAL) {
			/*
			 * We don't check if refcurrent(rbtnode) == 0 and try
			 * to free like we do in cache_find(), because
			 * refcurrent(rbtnode) must be non-zero.  This is so
			 * because 'node' is an argument to the function.
			 */
			mark_header_ancient(rbtdb, header);
			if (log) {
				isc_log_write(dns_lctx, category, module,
					      level,
					      "overmem cache: ancient %s",
					      printname);
			}
		} else if (force_expire) {
			if (!RETAIN(header)) {
				set_ttl(rbtdb, header, 0);
				mark_header_ancient(rbtdb, header);
			} else if (log) {
				isc_log_write(dns_lctx, category, module,
					      level,
					      "overmem cache: "
					      "reprieve by RETAIN() %s",
					      printname);
			}
		} else if (isc_mem_isovermem(rbtdb->common.mctx) && log) {
			isc_log_write(dns_lctx, category, module, level,
				      "overmem cache: saved %s", printname);
		}
	}
	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_write);

	return (ISC_R_SUCCESS);
}

static void
overmem(dns_db_t *db, bool over) {
	/* This is an empty callback.
See adb.c:water() */ UNUSED(db); UNUSED(over); return; } static void printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *rbtnode = node; bool first; uint32_t refs; REQUIRE(VALID_RBTDB(rbtdb)); NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); refs = isc_refcount_current(&rbtnode->references); fprintf(out, "node %p, %" PRIu32 " references, locknum = %u\n", rbtnode, refs, rbtnode->locknum); if (rbtnode->data != NULL) { rdatasetheader_t *current, *top_next; for (current = rbtnode->data; current != NULL; current = top_next) { top_next = current->next; first = true; fprintf(out, "\ttype %u", current->type); do { uint_least16_t attributes = atomic_load_acquire( ¤t->attributes); if (!first) { fprintf(out, "\t"); } first = false; fprintf(out, "\tserial = %lu, ttl = %u, " "trust = %u, attributes = %" PRIuLEAST16 ", " "resign = %u\n", (unsigned long)current->serial, current->rdh_ttl, current->trust, attributes, (current->resign << 1) | current->resign_lsb); current = current->down; } while (current != NULL); } } else { fprintf(out, "(empty)\n"); } NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); } static isc_result_t createiterator(dns_db_t *db, unsigned int options, dns_dbiterator_t **iteratorp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; rbtdb_dbiterator_t *rbtdbiter; REQUIRE(VALID_RBTDB(rbtdb)); rbtdbiter = isc_mem_get(rbtdb->common.mctx, sizeof(*rbtdbiter)); rbtdbiter->common.methods = &dbiterator_methods; rbtdbiter->common.db = NULL; dns_db_attach(db, &rbtdbiter->common.db); rbtdbiter->common.relative_names = ((options & DNS_DB_RELATIVENAMES) != 0); rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC; rbtdbiter->common.cleaning = false; rbtdbiter->paused = true; rbtdbiter->tree_locked = isc_rwlocktype_none; rbtdbiter->result = ISC_R_SUCCESS; dns_fixedname_init(&rbtdbiter->name); dns_fixedname_init(&rbtdbiter->origin); rbtdbiter->node = NULL; rbtdbiter->delcnt = 0; 
rbtdbiter->nsec3only = ((options & DNS_DB_NSEC3ONLY) != 0); rbtdbiter->nonsec3 = ((options & DNS_DB_NONSEC3) != 0); memset(rbtdbiter->deletions, 0, sizeof(rbtdbiter->deletions)); dns_rbtnodechain_init(&rbtdbiter->chain); dns_rbtnodechain_init(&rbtdbiter->nsec3chain); if (rbtdbiter->nsec3only) { rbtdbiter->current = &rbtdbiter->nsec3chain; } else { rbtdbiter->current = &rbtdbiter->chain; } *iteratorp = (dns_dbiterator_t *)rbtdbiter; return (ISC_R_SUCCESS); } static isc_result_t zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, dns_rdatatype_t type, dns_rdatatype_t covers, isc_stdtime_t now, dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node; rdatasetheader_t *header, *header_next, *found, *foundsig; rbtdb_serial_t serial; rbtdb_version_t *rbtversion = version; bool close_version = false; rbtdb_rdatatype_t matchtype, sigmatchtype; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(type != dns_rdatatype_any); INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb); if (rbtversion == NULL) { currentversion(db, (dns_dbversion_t **)(void *)(&rbtversion)); close_version = true; } serial = rbtversion->serial; now = 0; NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); found = NULL; foundsig = NULL; matchtype = RBTDB_RDATATYPE_VALUE(type, covers); if (covers == 0) { sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type); } else { sigmatchtype = 0; } for (header = rbtnode->data; header != NULL; header = header_next) { header_next = header->next; do { if (header->serial <= serial && !IGNORE(header)) { /* * Is this a "this rdataset doesn't * exist" record? */ if (NONEXISTENT(header)) { header = NULL; } break; } else { header = header->down; } } while (header != NULL); if (header != NULL) { /* * We have an active, extant rdataset. If it's a * type we're looking for, remember it. 
*/ if (header->type == matchtype) { found = header; if (foundsig != NULL) { break; } } else if (header->type == sigmatchtype) { foundsig = header; if (found != NULL) { break; } } } } if (found != NULL) { bind_rdataset(rbtdb, rbtnode, found, now, isc_rwlocktype_read, rdataset); if (foundsig != NULL) { bind_rdataset(rbtdb, rbtnode, foundsig, now, isc_rwlocktype_read, sigrdataset); } } NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); if (close_version) { closeversion(db, (dns_dbversion_t **)(void *)(&rbtversion), false); } if (found == NULL) { return (ISC_R_NOTFOUND); } return (ISC_R_SUCCESS); } static isc_result_t cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, dns_rdatatype_t type, dns_rdatatype_t covers, isc_stdtime_t now, dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node; rdatasetheader_t *header, *header_next, *found, *foundsig; rbtdb_rdatatype_t matchtype, sigmatchtype, negtype; isc_result_t result; nodelock_t *lock; isc_rwlocktype_t locktype; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(type != dns_rdatatype_any); UNUSED(version); result = ISC_R_SUCCESS; if (now == 0) { isc_stdtime_get(&now); } lock = &rbtdb->node_locks[rbtnode->locknum].lock; locktype = isc_rwlocktype_read; NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; matchtype = RBTDB_RDATATYPE_VALUE(type, covers); negtype = RBTDB_RDATATYPE_VALUE(0, type); if (covers == 0) { sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type); } else { sigmatchtype = 0; } for (header = rbtnode->data; header != NULL; header = header_next) { header_next = header->next; if (!ACTIVE(header, now)) { if ((header->rdh_ttl + STALE_TTL(header, rbtdb) < now - RBTDB_VIRTUAL) && (locktype == isc_rwlocktype_write || NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) { /* * We update the node's status only when we * can get write access. 
*/ locktype = isc_rwlocktype_write; /* * We don't check if refcurrent(rbtnode) == 0 * and try to free like we do in cache_find(), * because refcurrent(rbtnode) must be * non-zero. This is so because 'node' is an * argument to the function. */ mark_header_ancient(rbtdb, header); } } else if (EXISTS(header) && !ANCIENT(header)) { if (header->type == matchtype) { found = header; } else if (header->type == RBTDB_RDATATYPE_NCACHEANY || header->type == negtype) { found = header; } else if (header->type == sigmatchtype) { foundsig = header; } } } if (found != NULL) { bind_rdataset(rbtdb, rbtnode, found, now, locktype, rdataset); if (!NEGATIVE(found) && foundsig != NULL) { bind_rdataset(rbtdb, rbtnode, foundsig, now, locktype, sigrdataset); } } NODE_UNLOCK(lock, locktype); if (found == NULL) { return (ISC_R_NOTFOUND); } if (NEGATIVE(found)) { /* * We found a negative cache entry. */ if (NXDOMAIN(found)) { result = DNS_R_NCACHENXDOMAIN; } else { result = DNS_R_NCACHENXRRSET; } } update_cachestats(rbtdb, result); return (result); } static isc_result_t allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, unsigned int options, isc_stdtime_t now, dns_rdatasetiter_t **iteratorp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node; rbtdb_version_t *rbtversion = version; rbtdb_rdatasetiter_t *iterator; REQUIRE(VALID_RBTDB(rbtdb)); iterator = isc_mem_get(rbtdb->common.mctx, sizeof(*iterator)); if ((db->attributes & DNS_DBATTR_CACHE) == 0) { now = 0; if (rbtversion == NULL) { currentversion( db, (dns_dbversion_t **)(void *)(&rbtversion)); } else { INSIST(rbtversion->rbtdb == rbtdb); (void)isc_refcount_increment(&rbtversion->references); } } else { if (now == 0) { isc_stdtime_get(&now); } rbtversion = NULL; } iterator->common.magic = DNS_RDATASETITER_MAGIC; iterator->common.methods = &rdatasetiter_methods; iterator->common.db = db; iterator->common.node = node; iterator->common.version = (dns_dbversion_t *)rbtversion; 
iterator->common.options = options;
	iterator->common.now = now;
	isc_refcount_increment(&rbtnode->references);
	iterator->current = NULL;
	*iteratorp = (dns_rdatasetiter_t *)iterator;

	return (ISC_R_SUCCESS);
}

/*
 * Check whether 'node' contains both an active, extant CNAME and active,
 * extant "other data" visible at version 'serial'.  "Other data" is any
 * rdataset whose base type is not KEY, SIG, NSEC or RRSIG; that
 * combination at one name is invalid in a zone, and the caller (add32)
 * reports it as DNS_R_CNAMEANDOTHER.
 */
static bool
cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) {
	rdatasetheader_t *header, *header_next;
	bool cname, other_data;
	dns_rdatatype_t rdtype;

	/*
	 * The caller must hold the node lock.
	 */

	/*
	 * Look for CNAME and "other data" rdatasets active in our version.
	 */
	cname = false;
	other_data = false;
	for (header = node->data; header != NULL; header = header_next) {
		header_next = header->next;
		if (header->type == dns_rdatatype_cname) {
			/*
			 * Look for an active extant CNAME.
			 */
			do {
				if (header->serial <= serial &&
				    !IGNORE(header)) {
					/*
					 * Is this a "this rdataset doesn't
					 * exist" record?
					 */
					if (NONEXISTENT(header)) {
						header = NULL;
					}
					break;
				} else {
					header = header->down;
				}
			} while (header != NULL);
			if (header != NULL) {
				cname = true;
			}
		} else {
			/*
			 * Look for active extant "other data".
			 *
			 * "Other data" is any rdataset whose type is not
			 * KEY, NSEC, SIG or RRSIG.
			 */
			rdtype = RBTDB_RDATATYPE_BASE(header->type);
			if (rdtype != dns_rdatatype_key &&
			    rdtype != dns_rdatatype_sig &&
			    rdtype != dns_rdatatype_nsec &&
			    rdtype != dns_rdatatype_rrsig) {
				/*
				 * Is it active and extant?
				 */
				do {
					if (header->serial <= serial &&
					    !IGNORE(header)) {
						/*
						 * Is this a "this rdataset
						 * doesn't exist" record?
						 */
						if (NONEXISTENT(header)) {
							header = NULL;
						}
						break;
					} else {
						header = header->down;
					}
				} while (header != NULL);
				if (header != NULL) {
					other_data = true;
				}
			}
		}
	}

	if (cname && other_data) {
		return (true);
	}

	return (false);
}

/*
 * Insert 'newheader' into the re-signing heap for lock bucket 'idx'.
 * Zone databases only; the header must not already be in a heap or on a
 * rdataset list (the INSISTs below enforce this).
 */
static void
resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader) {
	INSIST(!IS_CACHE(rbtdb));
	INSIST(newheader->heap_index == 0);
	INSIST(!ISC_LINK_LINKED(newheader, link));

	isc_heap_insert(rbtdb->heaps[idx], newheader);
}

/*
 * node write lock must be held.
*/
/*
 * Remove 'header' from its re-signing heap; in a zone version, also put
 * it on the version's resigned list (taking a node reference) so that
 * closeversion() can later undo or finalize the removal.
 */
static void
resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
	      rdatasetheader_t *header) {
	/*
	 * Remove the old header from the heap
	 */
	if (header != NULL && header->heap_index != 0) {
		isc_heap_delete(rbtdb->heaps[header->node->locknum],
				header->heap_index);
		header->heap_index = 0;
		if (version != NULL) {
			new_reference(rbtdb, header->node,
				      isc_rwlocktype_write);
			ISC_LIST_APPEND(version->resigned_list, header, link);
		}
	}
}

/*
 * Bytes one record of 'header' contributes to the zone-transfer size
 * estimate: the rdata size plus fixed per-record overhead (TTL, type,
 * class) plus the owner-name length 'namelen'.
 */
static uint64_t
recordsize(rdatasetheader_t *header, unsigned int namelen) {
	return (dns_rdataslab_rdatasize((unsigned char *)header,
					sizeof(*header)) +
		sizeof(dns_ttl_t) + sizeof(dns_rdatatype_t) +
		sizeof(dns_rdataclass_t) + namelen);
}

/*
 * Adjust the per-version record count and transfer-size counters by the
 * contents of 'header'; 'add' selects addition vs. subtraction.  The
 * version rwlock serializes concurrent updates.
 */
static void
update_recordsandxfrsize(bool add, rbtdb_version_t *rbtversion,
			 rdatasetheader_t *header, unsigned int namelen) {
	unsigned char *hdr = (unsigned char *)header;
	size_t hdrsize = sizeof(*header);

	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
	if (add) {
		rbtversion->records += dns_rdataslab_count(hdr, hdrsize);
		rbtversion->xfrsize += recordsize(header, namelen);
	} else {
		rbtversion->records -= dns_rdataslab_count(hdr, hdrsize);
		rbtversion->xfrsize -= recordsize(header, namelen);
	}
	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_write);
}

/*
 * write lock on rbtnode must be held.
 */
static isc_result_t
add32(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, const dns_name_t *nodename,
      rbtdb_version_t *rbtversion, rdatasetheader_t *newheader,
      unsigned int options, bool loading, dns_rdataset_t *addedrdataset,
      isc_stdtime_t now) {
	rbtdb_changed_t *changed = NULL;
	rdatasetheader_t *topheader = NULL, *topheader_prev = NULL;
	rdatasetheader_t *header = NULL, *sigheader = NULL;
	unsigned char *merged = NULL;
	isc_result_t result;
	bool header_nx;
	bool newheader_nx;
	bool merge;
	dns_rdatatype_t rdtype, covers;
	rbtdb_rdatatype_t negtype, sigtype;
	dns_trust_t trust;
	int idx;

	/*
	 * Add an rdatasetheader_t to a node.
	 */

	/*
	 * Caller must be holding the node lock.
*/ if ((options & DNS_DBADD_MERGE) != 0) { REQUIRE(rbtversion != NULL); merge = true; } else { merge = false; } if ((options & DNS_DBADD_FORCE) != 0) { trust = dns_trust_ultimate; } else { trust = newheader->trust; } if (rbtversion != NULL && !loading) { /* * We always add a changed record, even if no changes end up * being made to this node, because it's harmless and * simplifies the code. */ changed = add_changed(rbtdb, rbtversion, rbtnode); if (changed == NULL) { free_rdataset(rbtdb, rbtdb->common.mctx, newheader); return (ISC_R_NOMEMORY); } } newheader_nx = NONEXISTENT(newheader) ? true : false; topheader_prev = NULL; sigheader = NULL; negtype = 0; if (rbtversion == NULL && !newheader_nx) { rdtype = RBTDB_RDATATYPE_BASE(newheader->type); covers = RBTDB_RDATATYPE_EXT(newheader->type); sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, covers); if (NEGATIVE(newheader)) { /* * We're adding a negative cache entry. */ if (covers == dns_rdatatype_any) { /* * If we're adding an negative cache entry * which covers all types (NXDOMAIN, * NODATA(QTYPE=ANY)), * * We make all other data ancient so that the * only rdataset that can be found at this * node is the negative cache entry. */ for (topheader = rbtnode->data; topheader != NULL; topheader = topheader->next) { set_ttl(rbtdb, topheader, 0); mark_header_ancient(rbtdb, topheader); } goto find_header; } /* * Otherwise look for any RRSIGs of the given * type so they can be marked ancient later. */ for (topheader = rbtnode->data; topheader != NULL; topheader = topheader->next) { if (topheader->type == sigtype) { sigheader = topheader; } } negtype = RBTDB_RDATATYPE_VALUE(covers, 0); } else { /* * We're adding something that isn't a * negative cache entry. Look for an extant * non-ancient NXDOMAIN/NODATA(QTYPE=ANY) negative * cache entry. If we're adding an RRSIG, also * check for an extant non-ancient NODATA ncache * entry which covers the same type as the RRSIG. 
*/ for (topheader = rbtnode->data; topheader != NULL; topheader = topheader->next) { if ((topheader->type == RBTDB_RDATATYPE_NCACHEANY) || (newheader->type == sigtype && topheader->type == RBTDB_RDATATYPE_VALUE(0, covers))) { break; } } if (topheader != NULL && EXISTS(topheader) && ACTIVE(topheader, now)) { /* * Found one. */ if (trust < topheader->trust) { /* * The NXDOMAIN/NODATA(QTYPE=ANY) * is more trusted. */ free_rdataset(rbtdb, rbtdb->common.mctx, newheader); if (addedrdataset != NULL) { bind_rdataset( rbtdb, rbtnode, topheader, now, isc_rwlocktype_write, addedrdataset); } return (DNS_R_UNCHANGED); } /* * The new rdataset is better. Expire the * ncache entry. */ set_ttl(rbtdb, topheader, 0); mark_header_ancient(rbtdb, topheader); topheader = NULL; goto find_header; } negtype = RBTDB_RDATATYPE_VALUE(0, rdtype); } } for (topheader = rbtnode->data; topheader != NULL; topheader = topheader->next) { if (topheader->type == newheader->type || topheader->type == negtype) { break; } topheader_prev = topheader; } find_header: /* * If header isn't NULL, we've found the right type. There may be * IGNORE rdatasets between the top of the chain and the first real * data. We skip over them. */ header = topheader; while (header != NULL && IGNORE(header)) { header = header->down; } if (header != NULL) { header_nx = NONEXISTENT(header) ? true : false; /* * Deleting an already non-existent rdataset has no effect. */ if (header_nx && newheader_nx) { free_rdataset(rbtdb, rbtdb->common.mctx, newheader); return (DNS_R_UNCHANGED); } /* * Trying to add an rdataset with lower trust to a cache * DB has no effect, provided that the cache data isn't * stale. If the cache data is stale, new lower trust * data will supersede it below. Unclear what the best * policy is here. 
*/ if (rbtversion == NULL && trust < header->trust && (ACTIVE(header, now) || header_nx)) { free_rdataset(rbtdb, rbtdb->common.mctx, newheader); if (addedrdataset != NULL) { bind_rdataset(rbtdb, rbtnode, header, now, isc_rwlocktype_write, addedrdataset); } return (DNS_R_UNCHANGED); } /* * Don't merge if a nonexistent rdataset is involved. */ if (merge && (header_nx || newheader_nx)) { merge = false; } /* * If 'merge' is true, we'll try to create a new rdataset * that is the union of 'newheader' and 'header'. */ if (merge) { unsigned int flags = 0; INSIST(rbtversion->serial >= header->serial); merged = NULL; result = ISC_R_SUCCESS; if ((options & DNS_DBADD_EXACT) != 0) { flags |= DNS_RDATASLAB_EXACT; } /* * TTL use here is irrelevant to the cache; * merge is only done with zonedbs. */ if ((options & DNS_DBADD_EXACTTTL) != 0 && newheader->rdh_ttl != header->rdh_ttl) { result = DNS_R_NOTEXACT; } else if (newheader->rdh_ttl != header->rdh_ttl) { flags |= DNS_RDATASLAB_FORCE; } if (result == ISC_R_SUCCESS) { result = dns_rdataslab_merge( (unsigned char *)header, (unsigned char *)newheader, (unsigned int)(sizeof(*newheader)), rbtdb->common.mctx, rbtdb->common.rdclass, (dns_rdatatype_t)header->type, flags, &merged); } if (result == ISC_R_SUCCESS) { /* * If 'header' has the same serial number as * we do, we could clean it up now if we knew * that our caller had no references to it. * We don't know this, however, so we leave it * alone. It will get cleaned up when * clean_zone_node() runs. 
*/ free_rdataset(rbtdb, rbtdb->common.mctx, newheader); newheader = (rdatasetheader_t *)merged; init_rdataset(rbtdb, newheader); update_newheader(newheader, header); if (loading && RESIGN(newheader) && RESIGN(header) && resign_sooner(header, newheader)) { newheader->resign = header->resign; newheader->resign_lsb = header->resign_lsb; } } else { free_rdataset(rbtdb, rbtdb->common.mctx, newheader); return (result); } } /* * Don't replace existing NS, A and AAAA RRsets in the * cache if they are already exist. This prevents named * being locked to old servers. Don't lower trust of * existing record if the update is forced. Nothing * special to be done w.r.t stale data; it gets replaced * normally further down. */ if (IS_CACHE(rbtdb) && ACTIVE(header, now) && header->type == dns_rdatatype_ns && !header_nx && !newheader_nx && header->trust >= newheader->trust && dns_rdataslab_equalx((unsigned char *)header, (unsigned char *)newheader, (unsigned int)(sizeof(*newheader)), rbtdb->common.rdclass, (dns_rdatatype_t)header->type)) { /* * Honour the new ttl if it is less than the * older one. */ if (header->rdh_ttl > newheader->rdh_ttl) { set_ttl(rbtdb, header, newheader->rdh_ttl); } if (header->last_used != now) { update_header(rbtdb, header, now); } if (header->noqname == NULL && newheader->noqname != NULL) { header->noqname = newheader->noqname; newheader->noqname = NULL; } if (header->closest == NULL && newheader->closest != NULL) { header->closest = newheader->closest; newheader->closest = NULL; } free_rdataset(rbtdb, rbtdb->common.mctx, newheader); if (addedrdataset != NULL) { bind_rdataset(rbtdb, rbtnode, header, now, isc_rwlocktype_write, addedrdataset); } return (ISC_R_SUCCESS); } /* * If we have will be replacing a NS RRset force its TTL * to be no more than the current NS RRset's TTL. This * ensures the delegations that are withdrawn are honoured. 
*/ if (IS_CACHE(rbtdb) && ACTIVE(header, now) && header->type == dns_rdatatype_ns && !header_nx && !newheader_nx && header->trust <= newheader->trust) { if (newheader->rdh_ttl > header->rdh_ttl) { newheader->rdh_ttl = header->rdh_ttl; } } if (IS_CACHE(rbtdb) && ACTIVE(header, now) && (options & DNS_DBADD_PREFETCH) == 0 && (header->type == dns_rdatatype_a || header->type == dns_rdatatype_aaaa || header->type == dns_rdatatype_ds || header->type == RBTDB_RDATATYPE_SIGDS) && !header_nx && !newheader_nx && header->trust >= newheader->trust && dns_rdataslab_equal((unsigned char *)header, (unsigned char *)newheader, (unsigned int)(sizeof(*newheader)))) { /* * Honour the new ttl if it is less than the * older one. */ if (header->rdh_ttl > newheader->rdh_ttl) { set_ttl(rbtdb, header, newheader->rdh_ttl); } if (header->last_used != now) { update_header(rbtdb, header, now); } if (header->noqname == NULL && newheader->noqname != NULL) { header->noqname = newheader->noqname; newheader->noqname = NULL; } if (header->closest == NULL && newheader->closest != NULL) { header->closest = newheader->closest; newheader->closest = NULL; } free_rdataset(rbtdb, rbtdb->common.mctx, newheader); if (addedrdataset != NULL) { bind_rdataset(rbtdb, rbtnode, header, now, isc_rwlocktype_write, addedrdataset); } return (ISC_R_SUCCESS); } INSIST(rbtversion == NULL || rbtversion->serial >= topheader->serial); if (loading) { newheader->down = NULL; idx = newheader->node->locknum; if (IS_CACHE(rbtdb)) { if (ZEROTTL(newheader)) { newheader->last_used = atomic_load(&rbtdb->last_used) + 1; ISC_LIST_APPEND(rbtdb->rdatasets[idx], newheader, link); } else { ISC_LIST_PREPEND(rbtdb->rdatasets[idx], newheader, link); } INSIST(rbtdb->heaps != NULL); isc_heap_insert(rbtdb->heaps[idx], newheader); } else if (RESIGN(newheader)) { resign_insert(rbtdb, idx, newheader); /* * Don't call resign_delete as we don't need * to reverse the delete. The free_rdataset * call below will clean up the heap entry. 
*/ } /* * There are no other references to 'header' when * loading, so we MAY clean up 'header' now. * Since we don't generate changed records when * loading, we MUST clean up 'header' now. */ if (topheader_prev != NULL) { topheader_prev->next = newheader; } else { rbtnode->data = newheader; } newheader->next = topheader->next; if (rbtversion != NULL && !header_nx) { update_recordsandxfrsize(false, rbtversion, header, nodename->length); } free_rdataset(rbtdb, rbtdb->common.mctx, header); } else { idx = newheader->node->locknum; if (IS_CACHE(rbtdb)) { INSIST(rbtdb->heaps != NULL); isc_heap_insert(rbtdb->heaps[idx], newheader); if (ZEROTTL(newheader)) { newheader->last_used = atomic_load(&rbtdb->last_used) + 1; ISC_LIST_APPEND(rbtdb->rdatasets[idx], newheader, link); } else { ISC_LIST_PREPEND(rbtdb->rdatasets[idx], newheader, link); } } else if (RESIGN(newheader)) { resign_insert(rbtdb, idx, newheader); resign_delete(rbtdb, rbtversion, header); } if (topheader_prev != NULL) { topheader_prev->next = newheader; } else { rbtnode->data = newheader; } newheader->next = topheader->next; newheader->down = topheader; topheader->next = newheader; rbtnode->dirty = 1; if (changed != NULL) { changed->dirty = true; } if (rbtversion == NULL) { set_ttl(rbtdb, header, 0); mark_header_ancient(rbtdb, header); if (sigheader != NULL) { set_ttl(rbtdb, sigheader, 0); mark_header_ancient(rbtdb, sigheader); } } if (rbtversion != NULL && !header_nx) { update_recordsandxfrsize(false, rbtversion, header, nodename->length); } } } else { /* * No non-IGNORED rdatasets of the given type exist at * this node. */ /* * If we're trying to delete the type, don't bother. 
*/ if (newheader_nx) { free_rdataset(rbtdb, rbtdb->common.mctx, newheader); return (DNS_R_UNCHANGED); } idx = newheader->node->locknum; if (IS_CACHE(rbtdb)) { isc_heap_insert(rbtdb->heaps[idx], newheader); if (ZEROTTL(newheader)) { ISC_LIST_APPEND(rbtdb->rdatasets[idx], newheader, link); } else { ISC_LIST_PREPEND(rbtdb->rdatasets[idx], newheader, link); } } else if (RESIGN(newheader)) { resign_insert(rbtdb, idx, newheader); resign_delete(rbtdb, rbtversion, header); } if (topheader != NULL) { /* * We have an list of rdatasets of the given type, * but they're all marked IGNORE. We simply insert * the new rdataset at the head of the list. * * Ignored rdatasets cannot occur during loading, so * we INSIST on it. */ INSIST(!loading); INSIST(rbtversion == NULL || rbtversion->serial >= topheader->serial); if (topheader_prev != NULL) { topheader_prev->next = newheader; } else { rbtnode->data = newheader; } newheader->next = topheader->next; newheader->down = topheader; topheader->next = newheader; rbtnode->dirty = 1; if (changed != NULL) { changed->dirty = true; } } else { /* * No rdatasets of the given type exist at the node. */ newheader->next = rbtnode->data; newheader->down = NULL; rbtnode->data = newheader; } } if (rbtversion != NULL && !newheader_nx) { update_recordsandxfrsize(true, rbtversion, newheader, nodename->length); } /* * Check if the node now contains CNAME and other data. 
 */
	/* (tail of add32(), which begins earlier in the file) */
	if (rbtversion != NULL &&
	    cname_and_other_data(rbtnode, rbtversion->serial)) {
		return (DNS_R_CNAMEANDOTHER);
	}

	if (addedrdataset != NULL) {
		bind_rdataset(rbtdb, rbtnode, newheader, now,
			      isc_rwlocktype_write, addedrdataset);
	}

	return (ISC_R_SUCCESS);
}

/*
 * Decide whether adding an rdataset of 'type' turns 'node' into a
 * delegation point (so the caller can set the node's find_callback bit):
 * DNAME always delegates; for zone/stub databases NS also delegates,
 * except NS at the zone origin of a non-stub zone.
 */
static bool
delegating_type(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		rbtdb_rdatatype_t type) {
	if (IS_CACHE(rbtdb)) {
		if (type == dns_rdatatype_dname) {
			return (true);
		} else {
			return (false);
		}
	} else if (type == dns_rdatatype_dname ||
		   (type == dns_rdatatype_ns &&
		    (node != rbtdb->origin_node || IS_STUB(rbtdb))))
	{
		return (true);
	}
	return (false);
}

/*
 * Copy the NOQNAME proof (the negative rdataset and its RRSIG, as
 * returned by dns_rdataset_getnoqname()) into freshly allocated slabs
 * and hang them off 'newheader->noqname'.
 *
 * Returns ISC_R_SUCCESS, or a slab-allocation error after freeing the
 * partially built structure via free_noqname().
 * NOTE(review): isc_mem_get()/dns_name_dup() results are not checked here;
 * presumably this allocator aborts on OOM — confirm against isc_mem docs.
 */
static isc_result_t
addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
	   dns_rdataset_t *rdataset) {
	struct noqname *noqname;
	isc_mem_t *mctx = rbtdb->common.mctx;
	dns_name_t name;
	dns_rdataset_t neg, negsig;
	isc_result_t result;
	isc_region_t r;

	dns_name_init(&name, NULL);
	dns_rdataset_init(&neg);
	dns_rdataset_init(&negsig);

	result = dns_rdataset_getnoqname(rdataset, &name, &neg, &negsig);
	RUNTIME_CHECK(result == ISC_R_SUCCESS);

	noqname = isc_mem_get(mctx, sizeof(*noqname));
	dns_name_init(&noqname->name, NULL);
	noqname->neg = NULL;
	noqname->negsig = NULL;
	noqname->type = neg.type;
	dns_name_dup(&name, mctx, &noqname->name);
	result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0);
	if (result != ISC_R_SUCCESS) {
		goto cleanup;
	}
	noqname->neg = r.base;
	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0);
	if (result != ISC_R_SUCCESS) {
		goto cleanup;
	}
	noqname->negsig = r.base;
	dns_rdataset_disassociate(&neg);
	dns_rdataset_disassociate(&negsig);
	newheader->noqname = noqname;
	return (ISC_R_SUCCESS);

cleanup:
	dns_rdataset_disassociate(&neg);
	dns_rdataset_disassociate(&negsig);
	free_noqname(mctx, &noqname);
	return (result);
}

/*
 * Same as addnoqname() but for the "closest encloser" proof obtained via
 * dns_rdataset_getclosest(); the result is stored in 'newheader->closest'.
 * The same 'struct noqname' layout (and free_noqname()) is reused.
 */
static isc_result_t
addclosest(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
	   dns_rdataset_t *rdataset) {
	struct noqname *closest;
	isc_mem_t *mctx = rbtdb->common.mctx;
	dns_name_t name;
	dns_rdataset_t neg, negsig;
	isc_result_t result;
	isc_region_t r;

	dns_name_init(&name, NULL);
	dns_rdataset_init(&neg);
	dns_rdataset_init(&negsig);

	result = dns_rdataset_getclosest(rdataset, &name, &neg, &negsig);
	RUNTIME_CHECK(result == ISC_R_SUCCESS);

	closest = isc_mem_get(mctx, sizeof(*closest));
	dns_name_init(&closest->name, NULL);
	closest->neg = NULL;
	closest->negsig = NULL;
	closest->type = neg.type;
	dns_name_dup(&name, mctx, &closest->name);
	result = dns_rdataslab_fromrdataset(&neg, mctx, &r, 0);
	if (result != ISC_R_SUCCESS) {
		goto cleanup;
	}
	closest->neg = r.base;
	result = dns_rdataslab_fromrdataset(&negsig, mctx, &r, 0);
	if (result != ISC_R_SUCCESS) {
		goto cleanup;
	}
	closest->negsig = r.base;
	dns_rdataset_disassociate(&neg);
	dns_rdataset_disassociate(&negsig);
	newheader->closest = closest;
	return (ISC_R_SUCCESS);

cleanup:
	dns_rdataset_disassociate(&neg);
	dns_rdataset_disassociate(&negsig);
	free_noqname(mctx, &closest);
	return (result);
}

/* Forward declaration; the full method table is defined near end of file. */
static dns_dbmethods_t zone_methods;

/*
 * Bytes occupied by the slab backing 'header' (header plus rdata), or
 * just the header size for NONEXISTENT placeholder headers, which have
 * no rdata slab behind them.
 */
static size_t
rdataset_size(rdatasetheader_t *header) {
	if (!NONEXISTENT(header)) {
		return (dns_rdataslab_size((unsigned char *)header,
					   sizeof(*header)));
	}
	return (sizeof(*header));
}

/*
 * dns_dbmethods_t 'addrdataset' implementation: convert 'rdataset' into a
 * slab-backed rdatasetheader_t and add it to 'node' under 'version' (NULL
 * version = cache semantics), honoring 'options'.  Optionally binds the
 * resulting rdataset to 'addedrdataset'.  The heavy lifting (merging,
 * version chains) is done by add32(); this function handles validation,
 * header construction and lock acquisition.
 */
static isc_result_t
addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
	    isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options,
	    dns_rdataset_t *addedrdataset) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
	rbtdb_version_t *rbtversion = version;
	isc_region_t region;
	rdatasetheader_t *newheader;
	rdatasetheader_t *header;
	isc_result_t result;
	bool delegating;
	bool newnsec;
	bool tree_locked = false;
	bool cache_is_overmem = false;
	dns_fixedname_t fixed;
	dns_name_t *name;

	REQUIRE(VALID_RBTDB(rbtdb));
	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);

	if (rbtdb->common.methods == &zone_methods) {
		/*
		 * SOA records are only allowed at top of zone.
 */
		if (rdataset->type == dns_rdatatype_soa &&
		    node != rbtdb->origin_node)
		{
			return (DNS_R_NOTZONETOP);
		}
		/*
		 * NSEC3 data may only live in the NSEC3 tree and
		 * vice versa; enforce that invariant up front.
		 */
		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
		REQUIRE(((rbtnode->nsec == DNS_RBT_NSEC_NSEC3 &&
			  (rdataset->type == dns_rdatatype_nsec3 ||
			   rdataset->covers == dns_rdatatype_nsec3)) ||
			 (rbtnode->nsec != DNS_RBT_NSEC_NSEC3 &&
			  rdataset->type != dns_rdatatype_nsec3 &&
			  rdataset->covers != dns_rdatatype_nsec3)));
		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	}

	/*
	 * Cache (version == NULL) stores absolute expiry times, so we need
	 * a wall-clock 'now'; zone versions use relative TTLs and now == 0.
	 */
	if (rbtversion == NULL) {
		if (now == 0) {
			isc_stdtime_get(&now);
		}
	} else {
		now = 0;
	}

	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
					    &region, sizeof(rdatasetheader_t));
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	name = dns_fixedname_initname(&fixed);
	nodefullname(db, node, name);
	dns_rdataset_getownercase(rdataset, name);

	/* The header lives in the reserved space at the front of the slab. */
	newheader = (rdatasetheader_t *)region.base;
	init_rdataset(rbtdb, newheader);
	setownercase(newheader, name);
	set_ttl(rbtdb, newheader, rdataset->ttl + now);
	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
						rdataset->covers);
	atomic_init(&newheader->attributes, 0);
	if (rdataset->ttl == 0U) {
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_ZEROTTL);
	}
	newheader->noqname = NULL;
	newheader->closest = NULL;
	atomic_init(&newheader->count,
		    atomic_fetch_add_relaxed(&init_count, 1));
	newheader->trust = rdataset->trust;
	newheader->last_used = now;
	newheader->node = rbtnode;
	if (rbtversion != NULL) {
		newheader->serial = rbtversion->serial;
		now = 0;

		if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
			/*
			 * The re-signing time is stored shifted right one
			 * bit, with the low bit kept in resign_lsb.
			 */
			newheader->resign =
				(isc_stdtime_t)(dns_time64_from32(
							rdataset->resign) >>
						1);
			newheader->resign_lsb = rdataset->resign & 0x1;
		} else {
			newheader->resign = 0;
			newheader->resign_lsb = 0;
		}
	} else {
		/* Cache entry: copy negative/prefetch attribute bits. */
		newheader->serial = 1;
		newheader->resign = 0;
		newheader->resign_lsb = 0;
		if ((rdataset->attributes & DNS_RDATASETATTR_PREFETCH) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_PREFETCH);
		}
		if ((rdataset->attributes &
		     DNS_RDATASETATTR_NEGATIVE) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_NEGATIVE);
		}
		if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_NXDOMAIN);
		}
		if ((rdataset->attributes & DNS_RDATASETATTR_OPTOUT) != 0) {
			RDATASET_ATTR_SET(newheader, RDATASET_ATTR_OPTOUT);
		}
		if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) {
			result = addnoqname(rbtdb, newheader, rdataset);
			if (result != ISC_R_SUCCESS) {
				free_rdataset(rbtdb, rbtdb->common.mctx,
					      newheader);
				return (result);
			}
		}
		if ((rdataset->attributes & DNS_RDATASETATTR_CLOSEST) != 0) {
			result = addclosest(rbtdb, newheader, rdataset);
			if (result != ISC_R_SUCCESS) {
				free_rdataset(rbtdb, rbtdb->common.mctx,
					      newheader);
				return (result);
			}
		}
	}

	/*
	 * If we're adding a delegation type (e.g. NS or DNAME for a zone,
	 * just DNAME for the cache), then we need to set the callback bit
	 * on the node.
	 */
	if (delegating_type(rbtdb, rbtnode, rdataset->type)) {
		delegating = true;
	} else {
		delegating = false;
	}

	/*
	 * Add to the auxiliary NSEC tree if we're adding an NSEC record.
	 */
	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	if (rbtnode->nsec != DNS_RBT_NSEC_HAS_NSEC &&
	    rdataset->type == dns_rdatatype_nsec)
	{
		newnsec = true;
	} else {
		newnsec = false;
	}
	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);

	/*
	 * If we're adding a delegation type, adding to the auxiliary NSEC
	 * tree, or the DB is a cache in an overmem state, hold an
	 * exclusive lock on the tree.  In the latter case the lock does
	 * not necessarily have to be acquired but it will help purge
	 * ancient entries more effectively.
 */
	if (IS_CACHE(rbtdb) && isc_mem_isovermem(rbtdb->common.mctx)) {
		cache_is_overmem = true;
	}
	if (delegating || newnsec || cache_is_overmem) {
		tree_locked = true;
		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
	}

	if (cache_is_overmem) {
		overmem_purge(rbtdb, newheader, tree_locked);
	}

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_write);

	if (rbtdb->rrsetstats != NULL) {
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_STATCOUNT);
		update_rrsetstats(rbtdb, newheader->type,
				  atomic_load_acquire(&newheader->attributes),
				  true);
	}

	if (IS_CACHE(rbtdb)) {
		if (tree_locked) {
			cleanup_dead_nodes(rbtdb, rbtnode->locknum);
		}

		/*
		 * Opportunistically expire the soonest-to-expire entry in
		 * this bucket's TTL heap (heap element 1 is the root).
		 */
		header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
		if (header != NULL) {
			dns_ttl_t rdh_ttl = header->rdh_ttl;

			/* Only account for stale TTL if cache is not overmem */
			if (!cache_is_overmem) {
				rdh_ttl += STALE_TTL(header, rbtdb);
			}

			/*
			 * NOTE(review): 'now - RBTDB_VIRTUAL' is unsigned
			 * arithmetic and would wrap for now < RBTDB_VIRTUAL;
			 * presumably 'now' is always a real wall-clock time
			 * here — confirm.
			 */
			if (rdh_ttl < now - RBTDB_VIRTUAL) {
				expire_header(rbtdb, header, tree_locked,
					      expire_ttl);
			}
		}

		/*
		 * If we've been holding a write lock on the tree just for
		 * cleaning, we can release it now.  However, we still need
		 * the node lock.
		 */
		if (tree_locked && !delegating && !newnsec) {
			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
			tree_locked = false;
		}
	}

	result = ISC_R_SUCCESS;
	if (newnsec) {
		/* Mirror this name into the auxiliary NSEC tree. */
		dns_rbtnode_t *nsecnode;

		nsecnode = NULL;
		result = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
		if (result == ISC_R_SUCCESS) {
			nsecnode->nsec = DNS_RBT_NSEC_NSEC;
			rbtnode->nsec = DNS_RBT_NSEC_HAS_NSEC;
		} else if (result == ISC_R_EXISTS) {
			rbtnode->nsec = DNS_RBT_NSEC_HAS_NSEC;
			result = ISC_R_SUCCESS;
		}
	}

	if (result == ISC_R_SUCCESS) {
		result = add32(rbtdb, rbtnode, name, rbtversion, newheader,
			       options, false, addedrdataset, now);
	}
	if (result == ISC_R_SUCCESS && delegating) {
		rbtnode->find_callback = 1;
	}

	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_write);

	if (tree_locked) {
		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
	}

	/*
	 * Update the zone's secure status.
 If version is non-NULL
	 * this is deferred until closeversion() is called.
	 */
	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
		iszonesecure(db, version, rbtdb->origin_node);
	}

	return (result);
}

/*
 * dns_dbmethods_t 'subtractrdataset' implementation: remove the rdata in
 * 'rdataset' from the matching rdataset at 'node' in 'version'.  With
 * DNS_DBSUB_EXACT the TTL and rdata must match exactly (else
 * DNS_R_NOTEXACT).  If everything is removed a NONEXISTENT placeholder
 * header is linked instead; the surviving rdataset may be bound to
 * 'newrdataset'.  Zone databases only (rbtversion must be non-NULL).
 */
static isc_result_t
subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
		 dns_rdataset_t *rdataset, unsigned int options,
		 dns_rdataset_t *newrdataset) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
	rbtdb_version_t *rbtversion = version;
	dns_fixedname_t fname;
	dns_name_t *nodename = dns_fixedname_initname(&fname);
	rdatasetheader_t *topheader, *topheader_prev, *header, *newheader;
	unsigned char *subresult;
	isc_region_t region;
	isc_result_t result;
	rbtdb_changed_t *changed;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(rbtversion != NULL && rbtversion->rbtdb == rbtdb);

	if (rbtdb->common.methods == &zone_methods) {
		/* NSEC3 data only in the NSEC3 tree, and vice versa. */
		RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
		REQUIRE(((rbtnode->nsec == DNS_RBT_NSEC_NSEC3 &&
			  (rdataset->type == dns_rdatatype_nsec3 ||
			   rdataset->covers == dns_rdatatype_nsec3)) ||
			 (rbtnode->nsec != DNS_RBT_NSEC_NSEC3 &&
			  rdataset->type != dns_rdatatype_nsec3 &&
			  rdataset->covers != dns_rdatatype_nsec3)));
		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	}

	nodefullname(db, node, nodename);

	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
					    &region, sizeof(rdatasetheader_t));
	if (result != ISC_R_SUCCESS) {
		return (result);
	}
	newheader = (rdatasetheader_t *)region.base;
	init_rdataset(rbtdb, newheader);
	set_ttl(rbtdb, newheader, rdataset->ttl);
	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
						rdataset->covers);
	atomic_init(&newheader->attributes, 0);
	newheader->serial = rbtversion->serial;
	newheader->trust = 0;
	newheader->noqname = NULL;
	newheader->closest = NULL;
	atomic_init(&newheader->count,
		    atomic_fetch_add_relaxed(&init_count, 1));
	newheader->last_used = 0;
	newheader->node = rbtnode;
	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
		/* Re-sign time stored >>1 with the low bit in resign_lsb. */
		newheader->resign =
			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
					1);
		newheader->resign_lsb = rdataset->resign & 0x1;
	} else {
		newheader->resign = 0;
		newheader->resign_lsb = 0;
	}

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_write);

	changed = add_changed(rbtdb, rbtversion, rbtnode);
	if (changed == NULL) {
		free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
		NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
			    isc_rwlocktype_write);
		return (ISC_R_NOMEMORY);
	}

	/* Find the chain for this rdata type in the node's header list. */
	topheader_prev = NULL;
	for (topheader = rbtnode->data; topheader != NULL;
	     topheader = topheader->next)
	{
		if (topheader->type == newheader->type) {
			break;
		}
		topheader_prev = topheader;
	}
	/*
	 * If header isn't NULL, we've found the right type.  There may be
	 * IGNORE rdatasets between the top of the chain and the first real
	 * data.  We skip over them.
	 */
	header = topheader;
	while (header != NULL && IGNORE(header)) {
		header = header->down;
	}
	if (header != NULL && EXISTS(header)) {
		unsigned int flags = 0;
		subresult = NULL;
		result = ISC_R_SUCCESS;
		if ((options & DNS_DBSUB_EXACT) != 0) {
			flags |= DNS_RDATASLAB_EXACT;
			if (newheader->rdh_ttl != header->rdh_ttl) {
				result = DNS_R_NOTEXACT;
			}
		}
		if (result == ISC_R_SUCCESS) {
			result = dns_rdataslab_subtract(
				(unsigned char *)header,
				(unsigned char *)newheader,
				(unsigned int)(sizeof(*newheader)),
				rbtdb->common.mctx, rbtdb->common.rdclass,
				(dns_rdatatype_t)header->type, flags,
				&subresult);
		}
		if (result == ISC_R_SUCCESS) {
			/* Some rdata remain: adopt the difference slab. */
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			newheader = (rdatasetheader_t *)subresult;
			init_rdataset(rbtdb, newheader);
			update_newheader(newheader, header);
			if (RESIGN(header)) {
				RDATASET_ATTR_SET(newheader,
						  RDATASET_ATTR_RESIGN);
				newheader->resign = header->resign;
				newheader->resign_lsb = header->resign_lsb;
				resign_insert(rbtdb, rbtnode->locknum,
					      newheader);
			}
			/*
			 * We have to set the serial since the rdataslab
			 * subtraction routine copies the reserved portion of
			 * header, not newheader.
			 */
			newheader->serial = rbtversion->serial;
			/*
			 * XXXJT: dns_rdataslab_subtract() copied the pointers
			 * to additional info.  We need to clear these fields
			 * to avoid having duplicated references.
			 */
			update_recordsandxfrsize(true, rbtversion, newheader,
						 nodename->length);
		} else if (result == DNS_R_NXRRSET) {
			/*
			 * This subtraction would remove all of the rdata;
			 * add a nonexistent header instead.
			 */
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
			if (newheader == NULL) {
				result = ISC_R_NOMEMORY;
				goto unlock;
			}
			init_rdataset(rbtdb, newheader);
			set_ttl(rbtdb, newheader, 0);
			newheader->type = topheader->type;
			atomic_init(&newheader->attributes,
				    RDATASET_ATTR_NONEXISTENT);
			newheader->trust = 0;
			newheader->serial = rbtversion->serial;
			newheader->noqname = NULL;
			newheader->closest = NULL;
			atomic_init(&newheader->count, 0);
			newheader->node = rbtnode;
			newheader->resign = 0;
			newheader->resign_lsb = 0;
			newheader->last_used = 0;
		} else {
			free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
			goto unlock;
		}

		/*
		 * If we're here, we want to link newheader in front of
		 * topheader.
		 */
		INSIST(rbtversion->serial >= topheader->serial);
		update_recordsandxfrsize(false, rbtversion, header,
					 nodename->length);
		if (topheader_prev != NULL) {
			topheader_prev->next = newheader;
		} else {
			rbtnode->data = newheader;
		}
		newheader->next = topheader->next;
		newheader->down = topheader;
		topheader->next = newheader;
		rbtnode->dirty = 1;
		changed->dirty = true;
		resign_delete(rbtdb, rbtversion, header);
	} else {
		/*
		 * The rdataset doesn't exist, so we don't need to do anything
		 * to satisfy the deletion request.
		 */
		free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
		if ((options & DNS_DBSUB_EXACT) != 0) {
			result = DNS_R_NOTEXACT;
		} else {
			result = DNS_R_UNCHANGED;
		}
	}

	if (result == ISC_R_SUCCESS && newrdataset != NULL) {
		bind_rdataset(rbtdb, rbtnode, newheader, 0,
			      isc_rwlocktype_write, newrdataset);
	}

	if (result == DNS_R_NXRRSET && newrdataset != NULL &&
	    (options & DNS_DBSUB_WANTOLD) != 0)
	{
		/* Caller asked for the pre-deletion rdataset back. */
		bind_rdataset(rbtdb, rbtnode, header, 0, isc_rwlocktype_write,
			      newrdataset);
	}

unlock:
	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_write);

	/*
	 * Update the zone's secure status.  If version is non-NULL
	 * this is deferred until closeversion() is called.
	 */
	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
		version = rbtdb->current_version;
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
		iszonesecure(db, version, rbtdb->origin_node);
	}

	return (result);
}

/*
 * dns_dbmethods_t 'deleterdataset' implementation: mark the rdataset of
 * the given type (and covers, for RRSIGs) at 'node' nonexistent by adding
 * a NONEXISTENT placeholder header via add32() with DNS_DBADD_FORCE.
 * ANY and bare RRSIG deletions are not supported (ISC_R_NOTIMPLEMENTED).
 */
static isc_result_t
deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
	       dns_rdatatype_t type, dns_rdatatype_t covers) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
	rbtdb_version_t *rbtversion = version;
	dns_fixedname_t fname;
	dns_name_t *nodename = dns_fixedname_initname(&fname);
	isc_result_t result;
	rdatasetheader_t *newheader;

	REQUIRE(VALID_RBTDB(rbtdb));
	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);

	if (type == dns_rdatatype_any) {
		return (ISC_R_NOTIMPLEMENTED);
	}
	if (type == dns_rdatatype_rrsig && covers == 0) {
		return (ISC_R_NOTIMPLEMENTED);
	}

	newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
	if (newheader == NULL) {
		return (ISC_R_NOMEMORY);
	}
	init_rdataset(rbtdb, newheader);
	set_ttl(rbtdb, newheader, 0);
	newheader->type = RBTDB_RDATATYPE_VALUE(type, covers);
	atomic_init(&newheader->attributes, RDATASET_ATTR_NONEXISTENT);
	newheader->trust = 0;
	newheader->noqname = NULL;
	newheader->closest = NULL;
	if (rbtversion != NULL) {
		newheader->serial = rbtversion->serial;
	} else {
		newheader->serial = 0;
	}
	atomic_init(&newheader->count, 0);
	newheader->last_used = 0;
	newheader->node = rbtnode;

	nodefullname(db, node, nodename);

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_write);
	result = add32(rbtdb, rbtnode, nodename, rbtversion, newheader,
		       DNS_DBADD_FORCE, false, NULL, 0);
	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_write);

	/*
	 * Update the zone's secure status.  If version is non-NULL
	 * this is deferred until closeversion() is called.
	 */
	if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb)) {
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
		version = rbtdb->current_version;
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
		iszonesecure(db, version, rbtdb->origin_node);
	}

	return (result);
}

/*
 * load a non-NSEC3 node in the main tree and optionally to the auxiliary
 * NSEC.  On success (or ISC_R_EXISTS) '*nodep' is set to the main-tree
 * node.  If adding the NSEC-tree node fails, a main-tree node we just
 * created is rolled back so the two trees stay consistent.
 */
static isc_result_t
loadnode(dns_rbtdb_t *rbtdb, const dns_name_t *name, dns_rbtnode_t **nodep,
	 bool hasnsec) {
	isc_result_t noderesult, nsecresult, tmpresult;
	dns_rbtnode_t *nsecnode = NULL, *node = NULL;

	noderesult = dns_rbt_addnode(rbtdb->tree, name, &node);
	if (!hasnsec) {
		goto done;
	}
	if (noderesult == ISC_R_EXISTS) {
		/*
		 * Add a node to the auxiliary NSEC tree for an old node
		 * just now getting an NSEC record.
		 */
		if (node->nsec == DNS_RBT_NSEC_HAS_NSEC) {
			goto done;
		}
	} else if (noderesult != ISC_R_SUCCESS) {
		goto done;
	}

	/*
	 * Build the auxiliary tree for NSECs as we go.
	 * This tree speeds searches for closest NSECs that would otherwise
	 * need to examine many irrelevant nodes in large TLDs.
	 *
	 * Add nodes to the auxiliary tree after corresponding nodes have
	 * been added to the main tree.
	 */
	nsecresult = dns_rbt_addnode(rbtdb->nsec, name, &nsecnode);
	if (nsecresult == ISC_R_SUCCESS) {
		nsecnode->nsec = DNS_RBT_NSEC_NSEC;
		node->nsec = DNS_RBT_NSEC_HAS_NSEC;
		goto done;
	}
	if (nsecresult == ISC_R_EXISTS) {
#if 1 /* 0 */
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
			      "addnode: NSEC node already exists");
#endif /* if 1 */
		node->nsec = DNS_RBT_NSEC_HAS_NSEC;
		goto done;
	}

	if (noderesult == ISC_R_SUCCESS) {
		/*
		 * Remove the node we just added above.
		 */
		tmpresult = dns_rbt_deletenode(rbtdb->tree, node, false);
		if (tmpresult != ISC_R_SUCCESS) {
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
				      "loading_addrdataset: "
				      "dns_rbt_deletenode: %s after "
				      "dns_rbt_addnode(NSEC): %s",
				      isc_result_totext(tmpresult),
				      isc_result_totext(noderesult));
		}
	}

	/*
	 * Set the error condition to be returned.
	 */
	noderesult = nsecresult;

done:
	if (noderesult == ISC_R_SUCCESS || noderesult == ISC_R_EXISTS) {
		*nodep = node;
	}

	return (noderesult);
}

/*
 * dns_rdatacallbacks_t 'add' hook used while loading a zone or cache dump:
 * validate the incoming rdataset, find/create the owner node (NSEC3 data
 * goes in the nsec3 tree), build a slab-backed header and merge it in via
 * add32() with DNS_DBADD_MERGE.
 */
static isc_result_t
loading_addrdataset(void *arg, const dns_name_t *name,
		    dns_rdataset_t *rdataset) {
	rbtdb_load_t *loadctx = arg;
	dns_rbtdb_t *rbtdb = loadctx->rbtdb;
	dns_rbtnode_t *node;
	isc_result_t result;
	isc_region_t region;
	rdatasetheader_t *newheader;

	REQUIRE(rdataset->rdclass == rbtdb->common.rdclass);

	/*
	 * SOA records are only allowed at top of zone.
	 */
	if (rdataset->type == dns_rdatatype_soa && !IS_CACHE(rbtdb) &&
	    !dns_name_equal(name, &rbtdb->common.origin))
	{
		return (DNS_R_NOTZONETOP);
	}

	if (rdataset->type != dns_rdatatype_nsec3 &&
	    rdataset->covers != dns_rdatatype_nsec3)
	{
		add_empty_wildcards(rbtdb, name, false);
	}

	if (dns_name_iswildcard(name)) {
		/*
		 * NS record owners cannot legally be wild cards.
		 */
		if (rdataset->type == dns_rdatatype_ns) {
			return (DNS_R_INVALIDNS);
		}
		/*
		 * NSEC3 record owners cannot legally be wild cards.
		 */
		if (rdataset->type == dns_rdatatype_nsec3) {
			return (DNS_R_INVALIDNSEC3);
		}
		result = add_wildcard_magic(rbtdb, name, false);
		if (result != ISC_R_SUCCESS) {
			return (result);
		}
	}

	node = NULL;
	if (rdataset->type == dns_rdatatype_nsec3 ||
	    rdataset->covers == dns_rdatatype_nsec3)
	{
		result = dns_rbt_addnode(rbtdb->nsec3, name, &node);
		if (result == ISC_R_SUCCESS) {
			node->nsec = DNS_RBT_NSEC_NSEC3;
		}
	} else if (rdataset->type == dns_rdatatype_nsec) {
		result = loadnode(rbtdb, name, &node, true);
	} else {
		result = loadnode(rbtdb, name, &node, false);
	}
	if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS) {
		return (result);
	}
	if (result == ISC_R_SUCCESS) {
		/* New node: assign it to a lock bucket by hash. */
		node->locknum = node->hashval % rbtdb->node_lock_count;
	}

	result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
					    &region, sizeof(rdatasetheader_t));
	if (result != ISC_R_SUCCESS) {
		return (result);
	}
	newheader = (rdatasetheader_t *)region.base;
	init_rdataset(rbtdb, newheader);
	set_ttl(rbtdb, newheader, rdataset->ttl + loadctx->now); /* XXX overflow
								  * check */
	newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
						rdataset->covers);
	atomic_init(&newheader->attributes, 0);
	newheader->trust = rdataset->trust;
	newheader->serial = 1;
	newheader->noqname = NULL;
	newheader->closest = NULL;
	atomic_init(&newheader->count,
		    atomic_fetch_add_relaxed(&init_count, 1));
	newheader->last_used = 0;
	newheader->node = node;
	setownercase(newheader, name);

	if ((rdataset->attributes & DNS_RDATASETATTR_RESIGN) != 0) {
		RDATASET_ATTR_SET(newheader, RDATASET_ATTR_RESIGN);
		/* Re-sign time stored >>1 with the low bit in resign_lsb. */
		newheader->resign =
			(isc_stdtime_t)(dns_time64_from32(rdataset->resign) >>
					1);
		newheader->resign_lsb = rdataset->resign & 0x1;
	} else {
		newheader->resign = 0;
		newheader->resign_lsb = 0;
	}

	NODE_LOCK(&rbtdb->node_locks[node->locknum].lock,
		  isc_rwlocktype_write);
	result = add32(rbtdb, node, name, rbtdb->current_version, newheader,
		       DNS_DBADD_MERGE, true, NULL, 0);
	NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock,
		    isc_rwlocktype_write);
	if (result == ISC_R_SUCCESS &&
	    delegating_type(rbtdb, node, rdataset->type))
	{
		node->find_callback = 1;
	} else if (result == DNS_R_UNCHANGED) {
		/* Duplicates during load are not an error. */
		result = ISC_R_SUCCESS;
	}

	return (result);
}

/*
 * dns_dbmethods_t 'beginload' implementation: allocate a load context,
 * mark the database as loading, and install loading_addrdataset() as the
 * parser's add callback.
 */
static isc_result_t
beginload(dns_db_t *db, dns_rdatacallbacks_t *callbacks) {
	rbtdb_load_t *loadctx;
	dns_rbtdb_t *rbtdb;
	rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(DNS_CALLBACK_VALID(callbacks));
	REQUIRE(VALID_RBTDB(rbtdb));

	loadctx = isc_mem_get(rbtdb->common.mctx, sizeof(*loadctx));

	loadctx->rbtdb = rbtdb;
	if (IS_CACHE(rbtdb)) {
		isc_stdtime_get(&loadctx->now);
	} else {
		loadctx->now = 0;
	}

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);

	REQUIRE((rbtdb->attributes & (RBTDB_ATTR_LOADED | RBTDB_ATTR_LOADING)) ==
		0);
	rbtdb->attributes |= RBTDB_ATTR_LOADING;

	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	callbacks->add = loading_addrdataset;
	callbacks->add_private = loadctx;

	return (ISC_R_SUCCESS);
}

/*
 * dns_dbmethods_t 'endload' implementation: flip LOADING -> LOADED,
 * recompute the zone's secure status, uninstall the callback and free
 * the load context.
 */
static isc_result_t
endload(dns_db_t *db, dns_rdatacallbacks_t *callbacks) {
	rbtdb_load_t *loadctx;
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(DNS_CALLBACK_VALID(callbacks));
	loadctx = callbacks->add_private;
	REQUIRE(loadctx != NULL);
	REQUIRE(loadctx->rbtdb == rbtdb);

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);

	REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0);
	REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0);

	rbtdb->attributes &= ~RBTDB_ATTR_LOADING;
	rbtdb->attributes |= RBTDB_ATTR_LOADED;

	/*
	 * If there's a KEY rdataset at the zone origin containing a
	 * zone key, we consider the zone secure.
	 */
	if (!IS_CACHE(rbtdb) && rbtdb->origin_node != NULL) {
		dns_dbversion_t *version = rbtdb->current_version;
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
		iszonesecure(db, version, rbtdb->origin_node);
	} else {
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
	}

	callbacks->add = NULL;
	callbacks->add_private = NULL;

	isc_mem_put(rbtdb->common.mctx, loadctx, sizeof(*loadctx));

	return (ISC_R_SUCCESS);
}

/*
 * dns_dbmethods_t 'dump' implementation: write the database to 'filename'
 * in the requested master-file format.
 */
static isc_result_t
dump(dns_db_t *db, dns_dbversion_t *version, const char *filename,
     dns_masterformat_t masterformat) {
	dns_rbtdb_t *rbtdb;
	rbtdb_version_t *rbtversion = version;

	rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));
	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);

	return (dns_master_dump(rbtdb->common.mctx, db, version,
				&dns_master_style_default, filename,
				masterformat, NULL));
}

/*
 * RBT node-data destructor: free every rdataset header chained off the
 * node's data pointer, under the node's bucket lock.
 */
static void
delete_callback(void *data, void *arg) {
	dns_rbtdb_t *rbtdb = arg;
	rdatasetheader_t *current, *next;
	unsigned int locknum;

	current = data;
	locknum = current->node->locknum;
	NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
	while (current != NULL) {
		next = current->next;
		free_rdataset(rbtdb, rbtdb->common.mctx, current);
		current = next;
	}
	NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
}

/* True if the current version's secure status is dns_db_secure. */
static bool
issecure(dns_db_t *db) {
	dns_rbtdb_t *rbtdb;
	bool secure;

	rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
	secure = (rbtdb->current_version->secure == dns_db_secure);
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);

	return (secure);
}

/* True if the current version is anything other than dns_db_insecure. */
static bool
isdnssec(dns_db_t *db) {
	dns_rbtdb_t *rbtdb;
	bool dnssec;

	rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
	dnssec = (rbtdb->current_version->secure != dns_db_insecure);
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);

	return (dnssec);
}

/* Node count of the selected tree (main, NSEC or NSEC3). */
static unsigned int
nodecount(dns_db_t *db, dns_dbtree_t tree) {
	dns_rbtdb_t *rbtdb;
	unsigned int count;

	rbtdb = (dns_rbtdb_t *)db;
	REQUIRE(VALID_RBTDB(rbtdb));

	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	switch (tree) {
	case dns_dbtree_main:
		count = dns_rbt_nodecount(rbtdb->tree);
		break;
	case dns_dbtree_nsec:
		count = dns_rbt_nodecount(rbtdb->nsec);
		break;
	case dns_dbtree_nsec3:
		count = dns_rbt_nodecount(rbtdb->nsec3);
		break;
	default:
		UNREACHABLE();
	}
	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);

	return (count);
}

/* Size of the main tree's hash table, for statistics/introspection. */
static size_t
hashsize(dns_db_t *db) {
	dns_rbtdb_t *rbtdb;
	size_t size;

	rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));

	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
	size = dns_rbt_hashsize(rbtdb->tree);
	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);

	return (size);
}

/* Replace the task associated with this database (NULL to clear). */
static void
settask(dns_db_t *db, isc_task_t *task) {
	dns_rbtdb_t *rbtdb;

	rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
	if (rbtdb->task != NULL) {
		isc_task_detach(&rbtdb->task);
	}
	if (task != NULL) {
		isc_task_attach(task, &rbtdb->task);
	}
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
}

/* RBT databases are in-memory only, never persistent. */
static bool
ispersistent(dns_db_t *db) {
	UNUSED(db);
	return (false);
}

/*
 * Return a new reference to the zone-origin node, or ISC_R_NOTFOUND for
 * caches (which have no origin node).
 */
static isc_result_t
getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *onode;
	isc_result_t result = ISC_R_SUCCESS;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(nodep != NULL && *nodep == NULL);

	/* Note that the access to origin_node doesn't require a DB lock */
	onode = (dns_rbtnode_t *)rbtdb->origin_node;
	if (onode != NULL) {
		new_reference(rbtdb, onode, isc_rwlocktype_none);
		*nodep = rbtdb->origin_node;
	} else {
		INSIST(IS_CACHE(rbtdb));
		result = ISC_R_NOTFOUND;
	}

	return (result);
}

/*
 * Copy out the NSEC3 parameters (hash, flags, iterations, salt) of
 * 'version' (current version if NULL).  Any out-pointer may be NULL.
 * Returns ISC_R_NOTFOUND if the version has no NSEC3 chain.
 */
static isc_result_t
getnsec3parameters(dns_db_t *db, dns_dbversion_t *version, dns_hash_t *hash,
		   uint8_t *flags, uint16_t *iterations, unsigned char *salt,
		   size_t *salt_length) {
	dns_rbtdb_t *rbtdb;
	isc_result_t result = ISC_R_NOTFOUND;
	rbtdb_version_t *rbtversion = version;

	rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));
	INSIST(rbtversion == NULL ||
	       rbtversion->rbtdb == rbtdb);

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
	if (rbtversion == NULL) {
		rbtversion = rbtdb->current_version;
	}

	if (rbtversion->havensec3) {
		if (hash != NULL) {
			*hash = rbtversion->hash;
		}
		if (salt != NULL && salt_length != NULL) {
			/* Caller's buffer must fit the stored salt. */
			REQUIRE(*salt_length >= rbtversion->salt_length);
			memmove(salt, rbtversion->salt,
				rbtversion->salt_length);
		}
		if (salt_length != NULL) {
			*salt_length = rbtversion->salt_length;
		}
		if (iterations != NULL) {
			*iterations = rbtversion->iterations;
		}
		if (flags != NULL) {
			*flags = rbtversion->flags;
		}
		result = ISC_R_SUCCESS;
	}
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);

	return (result);
}

/*
 * Copy out the record count and transfer size of 'version' (current
 * version if NULL).  Either out-pointer may be NULL.
 */
static isc_result_t
getsize(dns_db_t *db, dns_dbversion_t *version, uint64_t *records,
	uint64_t *xfrsize) {
	dns_rbtdb_t *rbtdb;
	isc_result_t result = ISC_R_SUCCESS;
	rbtdb_version_t *rbtversion = version;

	rbtdb = (dns_rbtdb_t *)db;

	REQUIRE(VALID_RBTDB(rbtdb));
	INSIST(rbtversion == NULL || rbtversion->rbtdb == rbtdb);

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
	if (rbtversion == NULL) {
		rbtversion = rbtdb->current_version;
	}

	RWLOCK(&rbtversion->rwlock, isc_rwlocktype_read);
	if (records != NULL) {
		*records = rbtversion->records;
	}

	if (xfrsize != NULL) {
		*xfrsize = rbtversion->xfrsize;
	}
	RWUNLOCK(&rbtversion->rwlock, isc_rwlocktype_read);
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);

	return (result);
}

/*
 * dns_dbmethods_t 'setsigningtime' implementation: set (or clear, when
 * resign == 0) the scheduled re-signing time of the rdataset, keeping the
 * per-bucket resign heap consistent.  Zone databases only.
 */
static isc_result_t
setsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, isc_stdtime_t resign) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rdatasetheader_t *header, oldheader;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(!IS_CACHE(rbtdb));
	REQUIRE(rdataset != NULL);

	/* The header sits immediately before the slab in private3. */
	header = rdataset->private3;
	header--;

	NODE_LOCK(&rbtdb->node_locks[header->node->locknum].lock,
		  isc_rwlocktype_write);

	oldheader = *header;
	/*
	 * Only break the heap invariant (by adjusting resign and resign_lsb)
	 * if we are going to be restoring it by calling isc_heap_increased
	 * or isc_heap_decreased.
	 */
	if (resign != 0) {
		/* Re-sign time stored >>1 with the low bit in resign_lsb. */
		header->resign =
			(isc_stdtime_t)(dns_time64_from32(resign) >> 1);
		header->resign_lsb = resign & 0x1;
	}
	if (header->heap_index != 0) {
		/* Already in the heap: delete, or re-balance as needed. */
		INSIST(RESIGN(header));
		if (resign == 0) {
			isc_heap_delete(rbtdb->heaps[header->node->locknum],
					header->heap_index);
			header->heap_index = 0;
		} else if (resign_sooner(header, &oldheader)) {
			isc_heap_increased(rbtdb->heaps[header->node->locknum],
					   header->heap_index);
		} else if (resign_sooner(&oldheader, header)) {
			isc_heap_decreased(rbtdb->heaps[header->node->locknum],
					   header->heap_index);
		}
	} else if (resign != 0) {
		RDATASET_ATTR_SET(header, RDATASET_ATTR_RESIGN);
		resign_insert(rbtdb, header->node->locknum, header);
	}
	NODE_UNLOCK(&rbtdb->node_locks[header->node->locknum].lock,
		    isc_rwlocktype_write);
	return (ISC_R_SUCCESS);
}

/*
 * dns_dbmethods_t 'getsigningtime' implementation: scan every bucket's
 * resign heap for the rdataset due to be re-signed soonest; bind it to
 * 'rdataset' and optionally return its owner name.  The winning bucket's
 * lock is held until the answer has been bound.
 */
static isc_result_t
getsigningtime(dns_db_t *db, dns_rdataset_t *rdataset, dns_name_t *foundname) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rdatasetheader_t *header = NULL, *this;
	unsigned int i;
	isc_result_t result = ISC_R_NOTFOUND;
	unsigned int locknum = 0;

	REQUIRE(VALID_RBTDB(rbtdb));

	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);

	for (i = 0; i < rbtdb->node_lock_count; i++) {
		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_read);

		/*
		 * Find for the earliest signing time among all of the
		 * heaps, each of which is covered by a different bucket
		 * lock.
		 */
		this = isc_heap_element(rbtdb->heaps[i], 1);
		if (this == NULL) {
			/* Nothing found; unlock and try the next heap. */
			NODE_UNLOCK(&rbtdb->node_locks[i].lock,
				    isc_rwlocktype_read);
			continue;
		}

		if (header == NULL) {
			/*
			 * Found a signing time: retain the bucket lock and
			 * preserve the lock number so we can unlock it
			 * later.
			 */
			header = this;
			locknum = i;
		} else if (resign_sooner(this, header)) {
			/*
			 * Found an earlier signing time; release the
			 * previous bucket lock and retain this one instead.
			 */
			NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
				    isc_rwlocktype_read);
			header = this;
			locknum = i;
		} else {
			/*
			 * Earliest signing time in this heap isn't
			 * an improvement; unlock and try the next heap.
			 */
			NODE_UNLOCK(&rbtdb->node_locks[i].lock,
				    isc_rwlocktype_read);
		}
	}

	if (header != NULL) {
		/*
		 * Found something; pass back the answer and unlock
		 * the bucket.
		 */
		bind_rdataset(rbtdb, header->node, header, 0,
			      isc_rwlocktype_read, rdataset);

		if (foundname != NULL) {
			dns_rbt_fullnamefromnode(header->node, foundname);
		}

		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
			    isc_rwlocktype_read);

		result = ISC_R_SUCCESS;
	}

	RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);

	return (result);
}

/*
 * dns_dbmethods_t 'resigned' implementation: called after an rdataset has
 * been re-signed; removes it from the resign heap (saving it on the
 * version's re-signed list so the change can be backed out).
 */
static void
resigned(dns_db_t *db, dns_rdataset_t *rdataset, dns_dbversion_t *version) {
	rbtdb_version_t *rbtversion = (rbtdb_version_t *)version;
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	dns_rbtnode_t *node;
	rdatasetheader_t *header;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(rdataset != NULL);
	REQUIRE(rdataset->methods == &rdataset_methods);
	REQUIRE(rbtdb->future_version == rbtversion);
	REQUIRE(rbtversion != NULL);
	REQUIRE(rbtversion->writer);
	REQUIRE(rbtversion->rbtdb == rbtdb);

	node = rdataset->private2;
	INSIST(node != NULL);
	header = rdataset->private3;
	INSIST(header != NULL);
	header--;

	if (header->heap_index == 0) {
		return;
	}

	RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
	NODE_LOCK(&rbtdb->node_locks[node->locknum].lock,
		  isc_rwlocktype_write);
	/*
	 * Delete from heap and save to re-signed list so that it can
	 * be restored if we backout of this change.
*/ resign_delete(rbtdb, rbtversion, header); NODE_UNLOCK(&rbtdb->node_locks[node->locknum].lock, isc_rwlocktype_write); RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); } static isc_result_t setcachestats(dns_db_t *db, isc_stats_t *stats) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(IS_CACHE(rbtdb)); /* current restriction */ REQUIRE(stats != NULL); isc_stats_attach(stats, &rbtdb->cachestats); return (ISC_R_SUCCESS); } static isc_result_t setgluecachestats(dns_db_t *db, isc_stats_t *stats) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(!IS_CACHE(rbtdb) && !IS_STUB(rbtdb)); REQUIRE(stats != NULL); isc_stats_attach(stats, &rbtdb->gluecachestats); return (ISC_R_SUCCESS); } static dns_stats_t * getrrsetstats(dns_db_t *db) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(IS_CACHE(rbtdb)); /* current restriction */ return (rbtdb->rrsetstats); } static isc_result_t nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node; isc_result_t result; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(node != NULL); REQUIRE(name != NULL); RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); result = dns_rbt_fullnamefromnode(rbtnode, name); RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); return (result); } static isc_result_t setservestalettl(dns_db_t *db, dns_ttl_t ttl) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(IS_CACHE(rbtdb)); /* currently no bounds checking. 0 means disable. 
*/ rbtdb->serve_stale_ttl = ttl; return (ISC_R_SUCCESS); } static isc_result_t getservestalettl(dns_db_t *db, dns_ttl_t *ttl) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(IS_CACHE(rbtdb)); *ttl = rbtdb->serve_stale_ttl; return (ISC_R_SUCCESS); } static isc_result_t setservestalerefresh(dns_db_t *db, uint32_t interval) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(IS_CACHE(rbtdb)); /* currently no bounds checking. 0 means disable. */ rbtdb->serve_stale_refresh = interval; return (ISC_R_SUCCESS); } static isc_result_t getservestalerefresh(dns_db_t *db, uint32_t *interval) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(IS_CACHE(rbtdb)); *interval = rbtdb->serve_stale_refresh; return (ISC_R_SUCCESS); } static dns_dbmethods_t zone_methods = { attach, detach, beginload, endload, dump, currentversion, newversion, attachversion, closeversion, findnode, zone_find, zone_findzonecut, attachnode, detachnode, expirenode, printnode, createiterator, zone_findrdataset, allrdatasets, addrdataset, subtractrdataset, deleterdataset, issecure, nodecount, ispersistent, overmem, settask, getoriginnode, NULL, /* transfernode */ getnsec3parameters, findnsec3node, setsigningtime, getsigningtime, resigned, isdnssec, NULL, /* getrrsetstats */ NULL, /* rpz_attach */ NULL, /* rpz_ready */ NULL, /* findnodeext */ NULL, /* findext */ NULL, /* setcachestats */ hashsize, nodefullname, getsize, NULL, /* setservestalettl */ NULL, /* getservestalettl */ NULL, /* setservestalerefresh */ NULL, /* getservestalerefresh */ setgluecachestats }; static dns_dbmethods_t cache_methods = { attach, detach, beginload, endload, dump, currentversion, newversion, attachversion, closeversion, findnode, cache_find, cache_findzonecut, attachnode, detachnode, expirenode, printnode, createiterator, cache_findrdataset, allrdatasets, addrdataset, subtractrdataset, deleterdataset, issecure, nodecount, ispersistent, overmem, 
settask, getoriginnode, NULL, /* transfernode */ NULL, /* getnsec3parameters */ NULL, /* findnsec3node */ NULL, /* setsigningtime */ NULL, /* getsigningtime */ NULL, /* resigned */ isdnssec, getrrsetstats, NULL, /* rpz_attach */ NULL, /* rpz_ready */ NULL, /* findnodeext */ NULL, /* findext */ setcachestats, hashsize, nodefullname, NULL, /* getsize */ setservestalettl, getservestalettl, setservestalerefresh, getservestalerefresh, NULL }; isc_result_t dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type, dns_rdataclass_t rdclass, unsigned int argc, char *argv[], void *driverarg, dns_db_t **dbp) { dns_rbtdb_t *rbtdb; isc_result_t result; int i; dns_name_t name; bool (*sooner)(void *, void *); isc_mem_t *hmctx = mctx; /* Keep the compiler happy. */ UNUSED(driverarg); rbtdb = isc_mem_get(mctx, sizeof(*rbtdb)); /* * If argv[0] exists, it points to a memory context to use for heap */ if (argc != 0) { hmctx = (isc_mem_t *)argv[0]; } memset(rbtdb, '\0', sizeof(*rbtdb)); dns_name_init(&rbtdb->common.origin, NULL); rbtdb->common.attributes = 0; if (type == dns_dbtype_cache) { rbtdb->common.methods = &cache_methods; rbtdb->common.attributes |= DNS_DBATTR_CACHE; } else if (type == dns_dbtype_stub) { rbtdb->common.methods = &zone_methods; rbtdb->common.attributes |= DNS_DBATTR_STUB; } else { rbtdb->common.methods = &zone_methods; } rbtdb->common.rdclass = rdclass; rbtdb->common.mctx = NULL; ISC_LIST_INIT(rbtdb->common.update_listeners); RBTDB_INITLOCK(&rbtdb->lock); isc_rwlock_init(&rbtdb->tree_lock, 0, 0); /* * Initialize node_lock_count in a generic way to support future * extension which allows the user to specify this value on creation. * Note that when specified for a cache DB it must be larger than 1 * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT. 
*/ if (rbtdb->node_lock_count == 0) { if (IS_CACHE(rbtdb)) { rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT; } else { rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT; } } else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) { result = ISC_R_RANGE; goto cleanup_tree_lock; } INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH)); rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); rbtdb->cachestats = NULL; rbtdb->gluecachestats = NULL; rbtdb->rrsetstats = NULL; if (IS_CACHE(rbtdb)) { result = dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats); if (result != ISC_R_SUCCESS) { goto cleanup_node_locks; } rbtdb->rdatasets = isc_mem_get( mctx, rbtdb->node_lock_count * sizeof(rdatasetheaderlist_t)); for (i = 0; i < (int)rbtdb->node_lock_count; i++) { ISC_LIST_INIT(rbtdb->rdatasets[i]); } } else { rbtdb->rdatasets = NULL; } /* * Create the heaps. */ rbtdb->heaps = isc_mem_get(hmctx, rbtdb->node_lock_count * sizeof(isc_heap_t *)); for (i = 0; i < (int)rbtdb->node_lock_count; i++) { rbtdb->heaps[i] = NULL; } sooner = IS_CACHE(rbtdb) ? ttl_sooner : resign_sooner; for (i = 0; i < (int)rbtdb->node_lock_count; i++) { isc_heap_create(hmctx, sooner, set_index, 0, &rbtdb->heaps[i]); } /* * Create deadnode lists. */ rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count * sizeof(rbtnodelist_t)); for (i = 0; i < (int)rbtdb->node_lock_count; i++) { ISC_LIST_INIT(rbtdb->deadnodes[i]); } ISC_LIST_INIT(rbtdb->prunenodes); rbtdb->active = rbtdb->node_lock_count; for (i = 0; i < (int)(rbtdb->node_lock_count); i++) { NODE_INITLOCK(&rbtdb->node_locks[i].lock); isc_refcount_init(&rbtdb->node_locks[i].references, 0); rbtdb->node_locks[i].exiting = false; } /* * Attach to the mctx. The database will persist so long as there * are references to it, and attaching to the mctx ensures that our * mctx won't disappear out from under us. 
*/ isc_mem_attach(mctx, &rbtdb->common.mctx); isc_mem_attach(hmctx, &rbtdb->hmctx); /* * Make a copy of the origin name. */ result = dns_name_dupwithoffsets(origin, mctx, &rbtdb->common.origin); if (result != ISC_R_SUCCESS) { free_rbtdb(rbtdb, false, NULL); return (result); } /* * Make the Red-Black Trees. */ result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree); if (result != ISC_R_SUCCESS) { free_rbtdb(rbtdb, false, NULL); return (result); } result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec); if (result != ISC_R_SUCCESS) { free_rbtdb(rbtdb, false, NULL); return (result); } result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->nsec3); if (result != ISC_R_SUCCESS) { free_rbtdb(rbtdb, false, NULL); return (result); } /* * In order to set the node callback bit correctly in zone databases, * we need to know if the node has the origin name of the zone. * In loading_addrdataset() we could simply compare the new name * to the origin name, but this is expensive. Also, we don't know the * node name in addrdataset(), so we need another way of knowing the * zone's top. * * We now explicitly create a node for the zone's origin, and then * we simply remember the node's address. This is safe, because * the top-of-zone node can never be deleted, nor can its address * change. */ if (!IS_CACHE(rbtdb)) { rbtdb->origin_node = NULL; result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin, &rbtdb->origin_node); if (result != ISC_R_SUCCESS) { INSIST(result != ISC_R_EXISTS); free_rbtdb(rbtdb, false, NULL); return (result); } INSIST(rbtdb->origin_node != NULL); rbtdb->origin_node->nsec = DNS_RBT_NSEC_NORMAL; /* * We need to give the origin node the right locknum. 
*/ dns_name_init(&name, NULL); dns_rbt_namefromnode(rbtdb->origin_node, &name); rbtdb->origin_node->locknum = rbtdb->origin_node->hashval % rbtdb->node_lock_count; /* * Add an apex node to the NSEC3 tree so that NSEC3 searches * return partial matches when there is only a single NSEC3 * record in the tree. */ rbtdb->nsec3_origin_node = NULL; result = dns_rbt_addnode(rbtdb->nsec3, &rbtdb->common.origin, &rbtdb->nsec3_origin_node); if (result != ISC_R_SUCCESS) { INSIST(result != ISC_R_EXISTS); free_rbtdb(rbtdb, false, NULL); return (result); } rbtdb->nsec3_origin_node->nsec = DNS_RBT_NSEC_NSEC3; /* * We need to give the nsec3 origin node the right locknum. */ dns_name_init(&name, NULL); dns_rbt_namefromnode(rbtdb->nsec3_origin_node, &name); rbtdb->nsec3_origin_node->locknum = rbtdb->nsec3_origin_node->hashval % rbtdb->node_lock_count; } /* * Misc. Initialization. */ isc_refcount_init(&rbtdb->references, 1); rbtdb->attributes = 0; rbtdb->task = NULL; rbtdb->serve_stale_ttl = 0; /* * Version Initialization. */ rbtdb->current_serial = 1; rbtdb->least_serial = 1; rbtdb->next_serial = 2; rbtdb->current_version = allocate_version(mctx, 1, 1, false); rbtdb->current_version->rbtdb = rbtdb; rbtdb->current_version->secure = dns_db_insecure; rbtdb->current_version->havensec3 = false; rbtdb->current_version->flags = 0; rbtdb->current_version->iterations = 0; rbtdb->current_version->hash = 0; rbtdb->current_version->salt_length = 0; memset(rbtdb->current_version->salt, 0, sizeof(rbtdb->current_version->salt)); isc_rwlock_init(&rbtdb->current_version->rwlock, 0, 0); rbtdb->current_version->records = 0; rbtdb->current_version->xfrsize = 0; rbtdb->future_version = NULL; ISC_LIST_INIT(rbtdb->open_versions); /* * Keep the current version in the open list so that list operation * won't happen in normal lookup operations. 
*/ PREPEND(rbtdb->open_versions, rbtdb->current_version, link); rbtdb->common.magic = DNS_DB_MAGIC; rbtdb->common.impmagic = RBTDB_MAGIC; *dbp = (dns_db_t *)rbtdb; return (ISC_R_SUCCESS); cleanup_node_locks: isc_mem_put(mctx, rbtdb->node_locks, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); cleanup_tree_lock: isc_rwlock_destroy(&rbtdb->tree_lock); RBTDB_DESTROYLOCK(&rbtdb->lock); isc_mem_put(mctx, rbtdb, sizeof(*rbtdb)); return (result); } /* * Slabbed Rdataset Methods */ static void rdataset_disassociate(dns_rdataset_t *rdataset) { dns_db_t *db = rdataset->private1; dns_dbnode_t *node = rdataset->private2; detachnode(db, &node); } static isc_result_t rdataset_first(dns_rdataset_t *rdataset) { unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int count; count = raw[0] * 256 + raw[1]; if (count == 0) { rdataset->private5 = NULL; return (ISC_R_NOMORE); } if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) { raw += DNS_RDATASET_COUNT; } raw += DNS_RDATASET_LENGTH; /* * The privateuint4 field is the number of rdata beyond the * cursor position, so we decrement the total count by one * before storing it. * * If DNS_RDATASETATTR_LOADORDER is not set 'raw' points to the * first record. If DNS_RDATASETATTR_LOADORDER is set 'raw' points * to the first entry in the offset table. */ count--; rdataset->privateuint4 = count; rdataset->private5 = raw; return (ISC_R_SUCCESS); } static isc_result_t rdataset_next(dns_rdataset_t *rdataset) { unsigned int count; unsigned int length; unsigned char *raw; /* RDATASLAB */ count = rdataset->privateuint4; if (count == 0) { return (ISC_R_NOMORE); } count--; rdataset->privateuint4 = count; /* * Skip forward one record (length + 4) or one offset (4). 
*/ raw = rdataset->private5; #if DNS_RDATASET_FIXED if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) #endif /* DNS_RDATASET_FIXED */ { length = raw[0] * 256 + raw[1]; raw += length; } rdataset->private5 = raw + DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH; return (ISC_R_SUCCESS); } static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) { unsigned char *raw = rdataset->private5; /* RDATASLAB */ unsigned int length; isc_region_t r; unsigned int flags = 0; REQUIRE(raw != NULL); /* * Find the start of the record if not already in private5 * then skip the length and order fields. */ #if DNS_RDATASET_FIXED if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) != 0) { unsigned int offset; offset = ((unsigned int)raw[0] << 24) + ((unsigned int)raw[1] << 16) + ((unsigned int)raw[2] << 8) + (unsigned int)raw[3]; raw = rdataset->private3; raw += offset; } #endif /* if DNS_RDATASET_FIXED */ length = raw[0] * 256 + raw[1]; raw += DNS_RDATASET_ORDER + DNS_RDATASET_LENGTH; if (rdataset->type == dns_rdatatype_rrsig) { if (*raw & DNS_RDATASLAB_OFFLINE) { flags |= DNS_RDATA_OFFLINE; } length--; raw++; } r.length = length; r.base = raw; dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r); rdata->flags |= flags; } static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) { dns_db_t *db = source->private1; dns_dbnode_t *node = source->private2; dns_dbnode_t *cloned_node = NULL; attachnode(db, node, &cloned_node); INSIST(!ISC_LINK_LINKED(target, link)); *target = *source; ISC_LINK_INIT(target, link); /* * Reset iterator state. 
*/ target->privateuint4 = 0; target->private5 = NULL; } static unsigned int rdataset_count(dns_rdataset_t *rdataset) { unsigned char *raw = rdataset->private3; /* RDATASLAB */ unsigned int count; count = raw[0] * 256 + raw[1]; return (count); } static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name, dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) { dns_db_t *db = rdataset->private1; dns_dbnode_t *node = rdataset->private2; dns_dbnode_t *cloned_node; const struct noqname *noqname = rdataset->private6; cloned_node = NULL; attachnode(db, node, &cloned_node); nsec->methods = &slab_methods; nsec->rdclass = db->rdclass; nsec->type = noqname->type; nsec->covers = 0; nsec->ttl = rdataset->ttl; nsec->trust = rdataset->trust; nsec->private1 = rdataset->private1; nsec->private2 = rdataset->private2; nsec->private3 = noqname->neg; nsec->privateuint4 = 0; nsec->private5 = NULL; nsec->private6 = NULL; nsec->private7 = NULL; cloned_node = NULL; attachnode(db, node, &cloned_node); nsecsig->methods = &slab_methods; nsecsig->rdclass = db->rdclass; nsecsig->type = dns_rdatatype_rrsig; nsecsig->covers = noqname->type; nsecsig->ttl = rdataset->ttl; nsecsig->trust = rdataset->trust; nsecsig->private1 = rdataset->private1; nsecsig->private2 = rdataset->private2; nsecsig->private3 = noqname->negsig; nsecsig->privateuint4 = 0; nsecsig->private5 = NULL; nsec->private6 = NULL; nsec->private7 = NULL; dns_name_clone(&noqname->name, name); return (ISC_R_SUCCESS); } static isc_result_t rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name, dns_rdataset_t *nsec, dns_rdataset_t *nsecsig) { dns_db_t *db = rdataset->private1; dns_dbnode_t *node = rdataset->private2; dns_dbnode_t *cloned_node; const struct noqname *closest = rdataset->private7; cloned_node = NULL; attachnode(db, node, &cloned_node); nsec->methods = &slab_methods; nsec->rdclass = db->rdclass; nsec->type = closest->type; nsec->covers = 0; nsec->ttl = rdataset->ttl; nsec->trust = rdataset->trust; 
nsec->private1 = rdataset->private1; nsec->private2 = rdataset->private2; nsec->private3 = closest->neg; nsec->privateuint4 = 0; nsec->private5 = NULL; nsec->private6 = NULL; nsec->private7 = NULL; cloned_node = NULL; attachnode(db, node, &cloned_node); nsecsig->methods = &slab_methods; nsecsig->rdclass = db->rdclass; nsecsig->type = dns_rdatatype_rrsig; nsecsig->covers = closest->type; nsecsig->ttl = rdataset->ttl; nsecsig->trust = rdataset->trust; nsecsig->private1 = rdataset->private1; nsecsig->private2 = rdataset->private2; nsecsig->private3 = closest->negsig; nsecsig->privateuint4 = 0; nsecsig->private5 = NULL; nsec->private6 = NULL; nsec->private7 = NULL; dns_name_clone(&closest->name, name); return (ISC_R_SUCCESS); } static void rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust) { dns_rbtdb_t *rbtdb = rdataset->private1; dns_rbtnode_t *rbtnode = rdataset->private2; rdatasetheader_t *header = rdataset->private3; header--; NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); header->trust = rdataset->trust = trust; NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); } static void rdataset_expire(dns_rdataset_t *rdataset) { dns_rbtdb_t *rbtdb = rdataset->private1; dns_rbtnode_t *rbtnode = rdataset->private2; rdatasetheader_t *header = rdataset->private3; header--; NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); expire_header(rbtdb, header, false, expire_flush); NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); } static void rdataset_clearprefetch(dns_rdataset_t *rdataset) { dns_rbtdb_t *rbtdb = rdataset->private1; dns_rbtnode_t *rbtnode = rdataset->private2; rdatasetheader_t *header = rdataset->private3; header--; NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); RDATASET_ATTR_CLR(header, RDATASET_ATTR_PREFETCH); NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_write); } /* * Rdataset Iterator 
Methods */ static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp) { rbtdb_rdatasetiter_t *rbtiterator; rbtiterator = (rbtdb_rdatasetiter_t *)(*iteratorp); if (rbtiterator->common.version != NULL) { closeversion(rbtiterator->common.db, &rbtiterator->common.version, false); } detachnode(rbtiterator->common.db, &rbtiterator->common.node); isc_mem_put(rbtiterator->common.db->mctx, rbtiterator, sizeof(*rbtiterator)); *iteratorp = NULL; } static bool iterator_active(dns_rbtdb_t *rbtdb, rbtdb_rdatasetiter_t *rbtiterator, rdatasetheader_t *header) { dns_ttl_t stale_ttl = header->rdh_ttl + STALE_TTL(header, rbtdb); /* * Is this a "this rdataset doesn't exist" record? */ if (NONEXISTENT(header)) { return (false); } /* * If this is a zone or this header still active then return it. */ if (!IS_CACHE(rbtdb) || ACTIVE(header, rbtiterator->common.now)) { return (true); } /* * If we are not returning stale records or the rdataset is * too old don't return it. */ if (!STALEOK(rbtiterator) || (rbtiterator->common.now > stale_ttl)) { return (false); } return (true); } static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator) { rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db); dns_rbtnode_t *rbtnode = rbtiterator->common.node; rbtdb_version_t *rbtversion = rbtiterator->common.version; rdatasetheader_t *header, *top_next; rbtdb_serial_t serial = IS_CACHE(rbtdb) ? 
1 : rbtversion->serial; NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); for (header = rbtnode->data; header != NULL; header = top_next) { top_next = header->next; do { if (EXPIREDOK(rbtiterator)) { if (!NONEXISTENT(header)) { break; } header = header->down; } else if (header->serial <= serial && !IGNORE(header)) { if (!iterator_active(rbtdb, rbtiterator, header)) { header = NULL; } break; } else { header = header->down; } } while (header != NULL); if (header != NULL) { break; } } NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); rbtiterator->current = header; if (header == NULL) { return (ISC_R_NOMORE); } return (ISC_R_SUCCESS); } static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator) { rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db); dns_rbtnode_t *rbtnode = rbtiterator->common.node; rbtdb_version_t *rbtversion = rbtiterator->common.version; rdatasetheader_t *header, *top_next; rbtdb_serial_t serial = IS_CACHE(rbtdb) ? 1 : rbtversion->serial; rbtdb_rdatatype_t type, negtype; dns_rdatatype_t rdtype, covers; bool expiredok = EXPIREDOK(rbtiterator); header = rbtiterator->current; if (header == NULL) { return (ISC_R_NOMORE); } NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); type = header->type; rdtype = RBTDB_RDATATYPE_BASE(header->type); if (NEGATIVE(header)) { covers = RBTDB_RDATATYPE_EXT(header->type); negtype = RBTDB_RDATATYPE_VALUE(covers, 0); } else { negtype = RBTDB_RDATATYPE_VALUE(0, rdtype); } /* * Find the start of the header chain for the next type * by walking back up the list. */ top_next = header->next; while (top_next != NULL && (top_next->type == type || top_next->type == negtype)) { top_next = top_next->next; } if (expiredok) { /* * Keep walking down the list if possible or * start the next type. */ header = header->down != NULL ? 
header->down : top_next; } else { header = top_next; } for (; header != NULL; header = top_next) { top_next = header->next; do { if (expiredok) { if (!NONEXISTENT(header)) { break; } header = header->down; } else if (header->serial <= serial && !IGNORE(header)) { if (!iterator_active(rbtdb, rbtiterator, header)) { header = NULL; } break; } else { header = header->down; } } while (header != NULL); if (header != NULL) { break; } /* * Find the start of the header chain for the next type * by walking back up the list. */ while (top_next != NULL && (top_next->type == type || top_next->type == negtype)) { top_next = top_next->next; } } NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); rbtiterator->current = header; if (header == NULL) { return (ISC_R_NOMORE); } return (ISC_R_SUCCESS); } static void rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) { rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db); dns_rbtnode_t *rbtnode = rbtiterator->common.node; rdatasetheader_t *header; header = rbtiterator->current; REQUIRE(header != NULL); NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now, isc_rwlocktype_read, rdataset); NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, isc_rwlocktype_read); } /* * Database Iterator Methods */ static void reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; dns_rbtnode_t *node = rbtdbiter->node; if (node == NULL) { return; } INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none); reactivate_node(rbtdb, node, rbtdbiter->tree_locked); } static void dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; dns_rbtnode_t *node = rbtdbiter->node; nodelock_t *lock; if (node == NULL) { return; } lock = 
&rbtdb->node_locks[node->locknum].lock; NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, rbtdbiter->tree_locked, false); NODE_UNLOCK(lock, isc_rwlocktype_read); rbtdbiter->node = NULL; } static void flush_deletions(rbtdb_dbiterator_t *rbtdbiter) { dns_rbtnode_t *node; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; bool was_read_locked = false; nodelock_t *lock; int i; if (rbtdbiter->delcnt != 0) { /* * Note that "%d node of %d in tree" can report things like * "flush_deletions: 59 nodes of 41 in tree". This means * That some nodes appear on the deletions list more than * once. Only the last occurrence will actually be deleted. */ isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1), "flush_deletions: %d nodes of %d in tree", rbtdbiter->delcnt, dns_rbt_nodecount(rbtdb->tree)); if (rbtdbiter->tree_locked == isc_rwlocktype_read) { RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); was_read_locked = true; } RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); rbtdbiter->tree_locked = isc_rwlocktype_write; for (i = 0; i < rbtdbiter->delcnt; i++) { node = rbtdbiter->deletions[i]; lock = &rbtdb->node_locks[node->locknum].lock; NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, rbtdbiter->tree_locked, false); NODE_UNLOCK(lock, isc_rwlocktype_read); } rbtdbiter->delcnt = 0; RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); if (was_read_locked) { RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); rbtdbiter->tree_locked = isc_rwlocktype_read; } else { rbtdbiter->tree_locked = isc_rwlocktype_none; } } } static void resume_iteration(rbtdb_dbiterator_t *rbtdbiter) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; REQUIRE(rbtdbiter->paused); REQUIRE(rbtdbiter->tree_locked == isc_rwlocktype_none); RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); rbtdbiter->tree_locked = isc_rwlocktype_read; rbtdbiter->paused = false; } static void 
dbiterator_destroy(dns_dbiterator_t **iteratorp) { rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)(*iteratorp); dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; dns_db_t *db = NULL; if (rbtdbiter->tree_locked == isc_rwlocktype_read) { RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read); rbtdbiter->tree_locked = isc_rwlocktype_none; } else { INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none); } dereference_iter_node(rbtdbiter); flush_deletions(rbtdbiter); dns_db_attach(rbtdbiter->common.db, &db); dns_db_detach(&rbtdbiter->common.db); dns_rbtnodechain_reset(&rbtdbiter->chain); dns_rbtnodechain_reset(&rbtdbiter->nsec3chain); isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter)); dns_db_detach(&db); *iteratorp = NULL; } static isc_result_t dbiterator_first(dns_dbiterator_t *iterator) { isc_result_t result; rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db; dns_name_t *name, *origin; if (rbtdbiter->result != ISC_R_SUCCESS && rbtdbiter->result != ISC_R_NOTFOUND && rbtdbiter->result != DNS_R_PARTIALMATCH && rbtdbiter->result != ISC_R_NOMORE) { return (rbtdbiter->result); } if (rbtdbiter->paused) { resume_iteration(rbtdbiter); } dereference_iter_node(rbtdbiter); name = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); dns_rbtnodechain_reset(&rbtdbiter->chain); dns_rbtnodechain_reset(&rbtdbiter->nsec3chain); if (rbtdbiter->nsec3only) { rbtdbiter->current = &rbtdbiter->nsec3chain; result = dns_rbtnodechain_first(rbtdbiter->current, rbtdb->nsec3, name, origin); } else { rbtdbiter->current = &rbtdbiter->chain; result = dns_rbtnodechain_first(rbtdbiter->current, rbtdb->tree, name, origin); if (!rbtdbiter->nonsec3 && result == ISC_R_NOTFOUND) { rbtdbiter->current = &rbtdbiter->nsec3chain; result = dns_rbtnodechain_first( rbtdbiter->current, rbtdb->nsec3, name, origin); } } if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) { result = 
dns_rbtnodechain_current(rbtdbiter->current, NULL, NULL, &rbtdbiter->node); if (result == ISC_R_SUCCESS) { rbtdbiter->new_origin = true; reference_iter_node(rbtdbiter); } } else { INSIST(result == ISC_R_NOTFOUND); result = ISC_R_NOMORE; /* The tree is empty. */ } rbtdbiter->result = result; if (result != ISC_R_SUCCESS) { ENSURE(!rbtdbiter->paused); } return (result); } static isc_result_t dbiterator_last(dns_dbiterator_t *iterator) { isc_result_t result; rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db; dns_name_t *name, *origin; if (rbtdbiter->result != ISC_R_SUCCESS && rbtdbiter->result != ISC_R_NOTFOUND && rbtdbiter->result != DNS_R_PARTIALMATCH && rbtdbiter->result != ISC_R_NOMORE) { return (rbtdbiter->result); } if (rbtdbiter->paused) { resume_iteration(rbtdbiter); } dereference_iter_node(rbtdbiter); name = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); dns_rbtnodechain_reset(&rbtdbiter->chain); dns_rbtnodechain_reset(&rbtdbiter->nsec3chain); result = ISC_R_NOTFOUND; if (rbtdbiter->nsec3only && !rbtdbiter->nonsec3) { rbtdbiter->current = &rbtdbiter->nsec3chain; result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->nsec3, name, origin); } if (!rbtdbiter->nsec3only && result == ISC_R_NOTFOUND) { rbtdbiter->current = &rbtdbiter->chain; result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree, name, origin); } if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) { result = dns_rbtnodechain_current(rbtdbiter->current, NULL, NULL, &rbtdbiter->node); if (result == ISC_R_SUCCESS) { rbtdbiter->new_origin = true; reference_iter_node(rbtdbiter); } } else { INSIST(result == ISC_R_NOTFOUND); result = ISC_R_NOMORE; /* The tree is empty. 
*/ } rbtdbiter->result = result; return (result); } static isc_result_t dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name) { isc_result_t result, tresult; rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db; dns_name_t *iname, *origin; if (rbtdbiter->result != ISC_R_SUCCESS && rbtdbiter->result != ISC_R_NOTFOUND && rbtdbiter->result != DNS_R_PARTIALMATCH && rbtdbiter->result != ISC_R_NOMORE) { return (rbtdbiter->result); } if (rbtdbiter->paused) { resume_iteration(rbtdbiter); } dereference_iter_node(rbtdbiter); iname = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); dns_rbtnodechain_reset(&rbtdbiter->chain); dns_rbtnodechain_reset(&rbtdbiter->nsec3chain); if (rbtdbiter->nsec3only) { rbtdbiter->current = &rbtdbiter->nsec3chain; result = dns_rbt_findnode(rbtdb->nsec3, name, NULL, &rbtdbiter->node, rbtdbiter->current, DNS_RBTFIND_EMPTYDATA, NULL, NULL); } else if (rbtdbiter->nonsec3) { rbtdbiter->current = &rbtdbiter->chain; result = dns_rbt_findnode(rbtdb->tree, name, NULL, &rbtdbiter->node, rbtdbiter->current, DNS_RBTFIND_EMPTYDATA, NULL, NULL); } else { /* * Stay on main chain if not found on either chain. 
*/ rbtdbiter->current = &rbtdbiter->chain; result = dns_rbt_findnode(rbtdb->tree, name, NULL, &rbtdbiter->node, rbtdbiter->current, DNS_RBTFIND_EMPTYDATA, NULL, NULL); if (result == DNS_R_PARTIALMATCH) { dns_rbtnode_t *node = NULL; tresult = dns_rbt_findnode( rbtdb->nsec3, name, NULL, &node, &rbtdbiter->nsec3chain, DNS_RBTFIND_EMPTYDATA, NULL, NULL); if (tresult == ISC_R_SUCCESS) { rbtdbiter->node = node; rbtdbiter->current = &rbtdbiter->nsec3chain; result = tresult; } } } if (result == ISC_R_SUCCESS || result == DNS_R_PARTIALMATCH) { tresult = dns_rbtnodechain_current(rbtdbiter->current, iname, origin, NULL); if (tresult == ISC_R_SUCCESS) { rbtdbiter->new_origin = true; reference_iter_node(rbtdbiter); } else { result = tresult; rbtdbiter->node = NULL; } } else { rbtdbiter->node = NULL; } rbtdbiter->result = (result == DNS_R_PARTIALMATCH) ? ISC_R_SUCCESS : result; return (result); } static isc_result_t dbiterator_prev(dns_dbiterator_t *iterator) { isc_result_t result; rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator; dns_name_t *name, *origin; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db; REQUIRE(rbtdbiter->node != NULL); if (rbtdbiter->result != ISC_R_SUCCESS) { return (rbtdbiter->result); } if (rbtdbiter->paused) { resume_iteration(rbtdbiter); } name = dns_fixedname_name(&rbtdbiter->name); origin = dns_fixedname_name(&rbtdbiter->origin); result = dns_rbtnodechain_prev(rbtdbiter->current, name, origin); if (result == ISC_R_NOMORE && !rbtdbiter->nsec3only && !rbtdbiter->nonsec3 && &rbtdbiter->nsec3chain == rbtdbiter->current) { rbtdbiter->current = &rbtdbiter->chain; dns_rbtnodechain_reset(rbtdbiter->current); result = dns_rbtnodechain_last(rbtdbiter->current, rbtdb->tree, name, origin); if (result == ISC_R_NOTFOUND) { result = ISC_R_NOMORE; } } dereference_iter_node(rbtdbiter); if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) { rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN); result = dns_rbtnodechain_current(rbtdbiter->current, 
						  NULL, NULL,
						  &rbtdbiter->node);
	}

	if (result == ISC_R_SUCCESS) {
		reference_iter_node(rbtdbiter);
	}

	rbtdbiter->result = result;

	return (result);
}

/*
 * Step the iterator to the next node, switching from the end of the
 * main chain to the start of the NSEC3 chain (unless the iterator is
 * restricted to a single chain).
 */
static isc_result_t
dbiterator_next(dns_dbiterator_t *iterator) {
	isc_result_t result;
	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
	dns_name_t *name, *origin;
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;

	REQUIRE(rbtdbiter->node != NULL);

	if (rbtdbiter->result != ISC_R_SUCCESS) {
		return (rbtdbiter->result);
	}

	if (rbtdbiter->paused) {
		resume_iteration(rbtdbiter);
	}

	name = dns_fixedname_name(&rbtdbiter->name);
	origin = dns_fixedname_name(&rbtdbiter->origin);
	result = dns_rbtnodechain_next(rbtdbiter->current, name, origin);

	/*
	 * When the main chain runs out, continue from the first node of
	 * the NSEC3 chain.
	 */
	if (result == ISC_R_NOMORE && !rbtdbiter->nsec3only &&
	    !rbtdbiter->nonsec3 && &rbtdbiter->chain == rbtdbiter->current)
	{
		rbtdbiter->current = &rbtdbiter->nsec3chain;
		dns_rbtnodechain_reset(rbtdbiter->current);
		result = dns_rbtnodechain_first(rbtdbiter->current,
						rbtdb->nsec3, name, origin);
		if (result == ISC_R_NOTFOUND) {
			result = ISC_R_NOMORE;
		}
	}

	dereference_iter_node(rbtdbiter);

	if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
		rbtdbiter->new_origin = (result == DNS_R_NEWORIGIN);
		result = dns_rbtnodechain_current(rbtdbiter->current, NULL,
						  NULL, &rbtdbiter->node);
	}

	if (result == ISC_R_SUCCESS) {
		reference_iter_node(rbtdbiter);
	}

	rbtdbiter->result = result;

	return (result);
}

/*
 * Return the node the iterator is positioned on (and optionally its
 * name).  In cleaning mode, also expire the node's rdatasets and queue
 * the node for deferred deletion.
 */
static isc_result_t
dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
		   dns_name_t *name) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
	dns_rbtnode_t *node = rbtdbiter->node;
	isc_result_t result;
	dns_name_t *nodename = dns_fixedname_name(&rbtdbiter->name);
	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);

	REQUIRE(rbtdbiter->result == ISC_R_SUCCESS);
	REQUIRE(rbtdbiter->node != NULL);

	if (rbtdbiter->paused) {
		resume_iteration(rbtdbiter);
	}

	if (name != NULL) {
		/*
		 * With relative_names, the stored name is returned
		 * without the origin appended.
		 */
		if (rbtdbiter->common.relative_names) {
			origin =
NULL;
		}
		result = dns_name_concatenate(nodename, origin, name, NULL);
		if (result != ISC_R_SUCCESS) {
			return (result);
		}
		if (rbtdbiter->common.relative_names && rbtdbiter->new_origin)
		{
			result = DNS_R_NEWORIGIN;
		}
	} else {
		result = ISC_R_SUCCESS;
	}

	new_reference(rbtdb, node, isc_rwlocktype_none);

	*nodep = rbtdbiter->node;

	if (iterator->cleaning && result == ISC_R_SUCCESS) {
		isc_result_t expire_result;

		/*
		 * If the deletion array is full, flush it before trying
		 * to expire the current node.  The current node can't be
		 * fully deleted while the iteration cursor is still on it.
		 */
		if (rbtdbiter->delcnt == DELETION_BATCH_MAX) {
			flush_deletions(rbtdbiter);
		}

		expire_result = expirenode(iterator->db, *nodep, 0);

		/*
		 * expirenode() currently always returns success.
		 */
		if (expire_result == ISC_R_SUCCESS && node->down == NULL) {
			/*
			 * Queue the node for deletion, taking an extra
			 * reference so it stays valid until flushed.
			 */
			rbtdbiter->deletions[rbtdbiter->delcnt++] = node;
			isc_refcount_increment(&node->references);
		}
	}

	return (result);
}

/*
 * Pause iteration: release the tree read lock (if held) and flush any
 * pending deferred deletions so other threads can make progress.
 */
static isc_result_t
dbiterator_pause(dns_dbiterator_t *iterator) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;

	if (rbtdbiter->result != ISC_R_SUCCESS &&
	    rbtdbiter->result != ISC_R_NOTFOUND &&
	    rbtdbiter->result != DNS_R_PARTIALMATCH &&
	    rbtdbiter->result != ISC_R_NOMORE)
	{
		return (rbtdbiter->result);
	}

	if (rbtdbiter->paused) {
		return (ISC_R_SUCCESS);
	}

	rbtdbiter->paused = true;

	if (rbtdbiter->tree_locked != isc_rwlocktype_none) {
		INSIST(rbtdbiter->tree_locked == isc_rwlocktype_read);
		RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
		rbtdbiter->tree_locked = isc_rwlocktype_none;
	}

	flush_deletions(rbtdbiter);

	return (ISC_R_SUCCESS);
}

/*
 * Copy the iterator's origin name into 'name'.
 */
static isc_result_t
dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) {
	rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
	dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);

	if (rbtdbiter->result != ISC_R_SUCCESS) {
		return (rbtdbiter->result);
	}

	dns_name_copy(origin, name);
	return (ISC_R_SUCCESS);
}

static void
/*
 * Record which bytes of 'name' are upper case in the header's 'upper'
 * bitmap so the owner name's original case can be restored later by
 * rdataset_getownercase().
 */
setownercase(rdatasetheader_t *header, const dns_name_t *name) {
	unsigned int i;
	bool fully_lower;

	/*
	 * We do not need to worry about label lengths as they are all
	 * less than or equal to 63.
	 */
	memset(header->upper, 0, sizeof(header->upper));
	fully_lower = true;
	for (i = 0; i < name->length; i++) {
		if (isupper(name->ndata[i])) {
			/* Bit i set == byte i of the name is upper case. */
			header->upper[i / 8] |= 1 << (i % 8);
			fully_lower = false;
		}
	}
	RDATASET_ATTR_SET(header, RDATASET_ATTR_CASESET);
	if (fully_lower) {
		/* Fast-path marker: no upper-case bytes at all. */
		RDATASET_ATTR_SET(header, RDATASET_ATTR_CASEFULLYLOWER);
	}
}

/*
 * dns_rdataset_setownercase() method: store the case bitmap of 'name'
 * in the rdataset's slab header, under the node write lock.
 */
static void
rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name) {
	dns_rbtdb_t *rbtdb = rdataset->private1;
	dns_rbtnode_t *rbtnode = rdataset->private2;
	unsigned char *raw = rdataset->private3; /* RDATASLAB */
	rdatasetheader_t *header;

	/* The header immediately precedes the raw slab data. */
	header = (struct rdatasetheader *)(raw - sizeof(*header));

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_write);
	setownercase(header, name);
	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_write);
}

/*
 * dns_rdataset_getownercase() method: rewrite the bytes of 'name' in
 * place so their case matches what setownercase() recorded.
 */
static void
rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name) {
	dns_rbtdb_t *rbtdb = rdataset->private1;
	dns_rbtnode_t *rbtnode = rdataset->private2;
	unsigned char *raw = rdataset->private3; /* RDATASLAB */
	rdatasetheader_t *header = NULL;
	uint8_t mask = (1 << 7);
	uint8_t bits = 0;

	header = (struct rdatasetheader *)(raw - sizeof(*header));

	NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		  isc_rwlocktype_read);

	if (!CASESET(header)) {
		/* No case information was recorded; leave name alone. */
		goto unlock;
	}

	if (CASEFULLYLOWER(header)) {
		for (size_t i = 0; i < name->length; i++) {
			name->ndata[i] = tolower(name->ndata[i]);
		}
	} else {
		for (size_t i = 0; i < name->length; i++) {
			/*
			 * The sentinel mask value (1 << 7) means "load
			 * the next bitmap byte"; afterwards the mask
			 * walks one bit per name byte.
			 */
			if (mask == (1 << 7)) {
				bits = header->upper[i / 8];
				mask = 1;
			} else {
				mask <<= 1;
			}

			name->ndata[i] = ((bits & mask) != 0) ?
						 toupper(name->ndata[i])
						 : tolower(name->ndata[i]);
		}
	}

unlock:
	NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
		    isc_rwlocktype_read);
}

/*
 * Cached glue (A/AAAA address rdatasets plus their signatures) for one
 * NS target name; entries are kept on a singly linked list hung off a
 * glue table node.
 */
struct rbtdb_glue {
	struct rbtdb_glue *next;
	dns_fixedname_t fixedname;
	dns_rdataset_t rdataset_a;
	dns_rdataset_t sigrdataset_a;
	dns_rdataset_t rdataset_aaaa;
	dns_rdataset_t sigrdataset_aaaa;
};

/* Callback state passed to glue_nsdname_cb(). */
typedef struct {
	rbtdb_glue_t *glue_list;
	dns_rbtdb_t *rbtdb;
	rbtdb_version_t *rbtversion;
} rbtdb_glue_additionaldata_ctx_t;

/*
 * Free a list of rbtdb_glue_t entries, disassociating any rdatasets
 * they hold.  The sentinel (void *)-1 marks a negative cache entry
 * ("no glue") and has nothing to free.
 */
static void
free_gluelist(rbtdb_glue_t *glue_list, dns_rbtdb_t *rbtdb) {
	rbtdb_glue_t *cur, *cur_next;

	if (glue_list == (void *)-1) {
		return;
	}

	cur = glue_list;
	while (cur != NULL) {
		cur_next = cur->next;

		if (dns_rdataset_isassociated(&cur->rdataset_a)) {
			dns_rdataset_disassociate(&cur->rdataset_a);
		}
		if (dns_rdataset_isassociated(&cur->sigrdataset_a)) {
			dns_rdataset_disassociate(&cur->sigrdataset_a);
		}
		if (dns_rdataset_isassociated(&cur->rdataset_aaaa)) {
			dns_rdataset_disassociate(&cur->rdataset_aaaa);
		}
		if (dns_rdataset_isassociated(&cur->sigrdataset_aaaa)) {
			dns_rdataset_disassociate(&cur->sigrdataset_aaaa);
		}

		dns_rdataset_invalidate(&cur->rdataset_a);
		dns_rdataset_invalidate(&cur->sigrdataset_a);
		dns_rdataset_invalidate(&cur->rdataset_aaaa);
		dns_rdataset_invalidate(&cur->sigrdataset_aaaa);

		isc_mem_put(rbtdb->common.mctx, cur, sizeof(*cur));
		cur = cur_next;
	}
}

/*
 * Tear down a version's glue hash table: free every chained node and
 * its glue list, then the bucket array itself.
 */
static void
free_gluetable(rbtdb_version_t *version) {
	dns_rbtdb_t *rbtdb;
	size_t size, i;

	RWLOCK(&version->glue_rwlock, isc_rwlocktype_write);

	rbtdb = version->rbtdb;

	for (i = 0; i < HASHSIZE(version->glue_table_bits); i++) {
		rbtdb_glue_table_node_t *cur, *cur_next;

		cur = version->glue_table[i];
		while (cur != NULL) {
			cur_next = cur->next;
			/* isc_refcount_decrement(&cur->node->references); */
			cur->node = NULL;
			free_gluelist(cur->glue_list, rbtdb);
			cur->glue_list = NULL;
			isc_mem_put(rbtdb->common.mctx, cur, sizeof(*cur));
			cur = cur_next;
		}
		version->glue_table[i] = NULL;
	}

	size = HASHSIZE(version->glue_table_bits) *
	       sizeof(*version->glue_table);
isc_mem_put(rbtdb->common.mctx, version->glue_table, size); RWUNLOCK(&version->glue_rwlock, isc_rwlocktype_write); } static uint32_t rehash_bits(rbtdb_version_t *version, size_t newcount) { uint32_t oldbits = version->glue_table_bits; uint32_t newbits = oldbits; while (newcount >= HASHSIZE(newbits) && newbits < RBTDB_GLUE_TABLE_MAX_BITS) { newbits += 1; } return (newbits); } /*% * Write lock (version->glue_rwlock) must be held. */ static void rehash_gluetable(rbtdb_version_t *version) { uint32_t oldbits, newbits; size_t newsize, oldcount, i; rbtdb_glue_table_node_t **oldtable; oldbits = version->glue_table_bits; oldcount = HASHSIZE(oldbits); oldtable = version->glue_table; newbits = rehash_bits(version, version->glue_table_nodecount); newsize = HASHSIZE(newbits) * sizeof(version->glue_table[0]); version->glue_table = isc_mem_get(version->rbtdb->common.mctx, newsize); version->glue_table_bits = newbits; memset(version->glue_table, 0, newsize); for (i = 0; i < oldcount; i++) { rbtdb_glue_table_node_t *gluenode; rbtdb_glue_table_node_t *nextgluenode; for (gluenode = oldtable[i]; gluenode != NULL; gluenode = nextgluenode) { uint32_t hash = isc_hash32( &gluenode->node, sizeof(gluenode->node), true); uint32_t idx = hash_32(hash, newbits); nextgluenode = gluenode->next; gluenode->next = version->glue_table[idx]; version->glue_table[idx] = gluenode; } } isc_mem_put(version->rbtdb->common.mctx, oldtable, oldcount * sizeof(*version->glue_table)); isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_ZONE, ISC_LOG_DEBUG(3), "rehash_gluetable(): " "resized glue table from %zu to " "%zu", oldcount, newsize / sizeof(version->glue_table[0])); } static void maybe_rehash_gluetable(rbtdb_version_t *version) { size_t overcommit = HASHSIZE(version->glue_table_bits) * RBTDB_GLUE_TABLE_OVERCOMMIT; if (version->glue_table_nodecount < overcommit) { return; } rehash_gluetable(version); } static isc_result_t glue_nsdname_cb(void *arg, const dns_name_t *name, dns_rdatatype_t qtype, 
		dns_rdataset_t *unused) {
	/*
	 * Additional-data callback invoked for each NS target name:
	 * look up A and AAAA glue for 'name' in the zone version and
	 * prepend a new rbtdb_glue_t to ctx->glue_list when found.
	 */
	rbtdb_glue_additionaldata_ctx_t *ctx;
	isc_result_t result;
	dns_fixedname_t fixedname_a;
	dns_name_t *name_a = NULL;
	dns_rdataset_t rdataset_a, sigrdataset_a;
	dns_rbtnode_t *node_a = NULL;
	dns_fixedname_t fixedname_aaaa;
	dns_name_t *name_aaaa = NULL;
	dns_rdataset_t rdataset_aaaa, sigrdataset_aaaa;
	dns_rbtnode_t *node_aaaa = NULL;
	rbtdb_glue_t *glue = NULL;
	dns_name_t *gluename = NULL;

	UNUSED(unused);

	/*
	 * NS records want addresses in additional records.
	 */
	INSIST(qtype == dns_rdatatype_a);

	ctx = (rbtdb_glue_additionaldata_ctx_t *)arg;

	name_a = dns_fixedname_initname(&fixedname_a);
	dns_rdataset_init(&rdataset_a);
	dns_rdataset_init(&sigrdataset_a);

	name_aaaa = dns_fixedname_initname(&fixedname_aaaa);
	dns_rdataset_init(&rdataset_aaaa);
	dns_rdataset_init(&sigrdataset_aaaa);

	/* A lookup; DNS_R_GLUE means usable glue was found. */
	result = zone_find((dns_db_t *)ctx->rbtdb, name, ctx->rbtversion,
			   dns_rdatatype_a, DNS_DBFIND_GLUEOK, 0,
			   (dns_dbnode_t **)&node_a, name_a, &rdataset_a,
			   &sigrdataset_a);
	if (result == DNS_R_GLUE) {
		glue = isc_mem_get(ctx->rbtdb->common.mctx, sizeof(*glue));
		gluename = dns_fixedname_initname(&glue->fixedname);
		dns_name_copy(name_a, gluename);
		dns_rdataset_init(&glue->rdataset_a);
		dns_rdataset_init(&glue->sigrdataset_a);
		dns_rdataset_init(&glue->rdataset_aaaa);
		dns_rdataset_init(&glue->sigrdataset_aaaa);
		dns_rdataset_clone(&rdataset_a, &glue->rdataset_a);
		if (dns_rdataset_isassociated(&sigrdataset_a)) {
			dns_rdataset_clone(&sigrdataset_a,
					   &glue->sigrdataset_a);
		}
	}

	/* AAAA lookup, reusing the glue entry if A already created one. */
	result = zone_find((dns_db_t *)ctx->rbtdb, name, ctx->rbtversion,
			   dns_rdatatype_aaaa, DNS_DBFIND_GLUEOK, 0,
			   (dns_dbnode_t **)&node_aaaa, name_aaaa,
			   &rdataset_aaaa, &sigrdataset_aaaa);
	if (result == DNS_R_GLUE) {
		if (glue == NULL) {
			glue = isc_mem_get(ctx->rbtdb->common.mctx,
					   sizeof(*glue));
			gluename = dns_fixedname_initname(&glue->fixedname);
			dns_name_copy(name_aaaa, gluename);
			dns_rdataset_init(&glue->rdataset_a);
			dns_rdataset_init(&glue->sigrdataset_a);
			dns_rdataset_init(&glue->rdataset_aaaa);
dns_rdataset_init(&glue->sigrdataset_aaaa); } else { INSIST(node_a == node_aaaa); INSIST(dns_name_equal(name_a, name_aaaa)); } dns_rdataset_clone(&rdataset_aaaa, &glue->rdataset_aaaa); if (dns_rdataset_isassociated(&sigrdataset_aaaa)) { dns_rdataset_clone(&sigrdataset_aaaa, &glue->sigrdataset_aaaa); } } if (glue != NULL) { glue->next = ctx->glue_list; ctx->glue_list = glue; } result = ISC_R_SUCCESS; if (dns_rdataset_isassociated(&rdataset_a)) { rdataset_disassociate(&rdataset_a); } if (dns_rdataset_isassociated(&sigrdataset_a)) { rdataset_disassociate(&sigrdataset_a); } if (dns_rdataset_isassociated(&rdataset_aaaa)) { rdataset_disassociate(&rdataset_aaaa); } if (dns_rdataset_isassociated(&sigrdataset_aaaa)) { rdataset_disassociate(&sigrdataset_aaaa); } if (node_a != NULL) { detachnode((dns_db_t *)ctx->rbtdb, (dns_dbnode_t *)&node_a); } if (node_aaaa != NULL) { detachnode((dns_db_t *)ctx->rbtdb, (dns_dbnode_t *)&node_aaaa); } return (result); } static isc_result_t rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version, dns_message_t *msg) { dns_rbtdb_t *rbtdb = rdataset->private1; dns_rbtnode_t *node = rdataset->private2; rbtdb_version_t *rbtversion = version; uint32_t idx; rbtdb_glue_table_node_t *cur; bool found = false; bool restarted = false; rbtdb_glue_t *ge; rbtdb_glue_additionaldata_ctx_t ctx; isc_result_t result; uint64_t hash; REQUIRE(rdataset->type == dns_rdatatype_ns); REQUIRE(rbtdb == rbtversion->rbtdb); REQUIRE(!IS_CACHE(rbtdb) && !IS_STUB(rbtdb)); /* * The glue table cache that forms a part of the DB version * structure is not explicitly bounded and there's no cache * cleaning. The zone data size itself is an implicit bound. * * The key into the glue hashtable is the node pointer. This is * because the glue hashtable is a property of the DB version, * and the glue is keyed for the ownername/NS tuple. 
 We don't
	 * bother with using an expensive dns_name_t comparison here as
	 * the node pointer is a fixed value that won't change for a DB
	 * version and can be compared directly.
	 */
	hash = isc_hash_function(&node, sizeof(node), true);

restart:
	/*
	 * First, check if we have the additional entries already cached
	 * in the glue table.
	 */
	RWLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_read);

	idx = hash_32(hash, rbtversion->glue_table_bits);

	/* Scan the hash chain for this node's entry. */
	for (cur = rbtversion->glue_table[idx]; cur != NULL; cur = cur->next) {
		if (cur->node == node) {
			break;
		}
	}

	if (cur == NULL) {
		goto no_glue;
	}

	/*
	 * We found a cached result.  Add it to the message and
	 * return.
	 */
	found = true;

	ge = cur->glue_list;

	/*
	 * (void *) -1 is a special value that means no glue is
	 * present in the zone.
	 */
	if (ge == (void *)-1) {
		if (!restarted && (rbtdb->gluecachestats != NULL)) {
			isc_stats_increment(
				rbtdb->gluecachestats,
				dns_gluecachestatscounter_hits_absent);
		}
		goto no_glue;
	} else {
		if (!restarted && (rbtdb->gluecachestats != NULL)) {
			isc_stats_increment(
				rbtdb->gluecachestats,
				dns_gluecachestatscounter_hits_present);
		}
	}

	/* Copy each cached glue entry into the message. */
	for (; ge != NULL; ge = ge->next) {
		dns_name_t *name = NULL;
		dns_rdataset_t *rdataset_a = NULL;
		dns_rdataset_t *sigrdataset_a = NULL;
		dns_rdataset_t *rdataset_aaaa = NULL;
		dns_rdataset_t *sigrdataset_aaaa = NULL;
		dns_name_t *gluename = dns_fixedname_name(&ge->fixedname);

		result = dns_message_gettempname(msg, &name);
		if (result != ISC_R_SUCCESS) {
			goto no_glue;
		}

		dns_name_copy(gluename, name);

		if (dns_rdataset_isassociated(&ge->rdataset_a)) {
			result = dns_message_gettemprdataset(msg, &rdataset_a);
			if (result != ISC_R_SUCCESS) {
				dns_message_puttempname(msg, &name);
				goto no_glue;
			}
		}

		if (dns_rdataset_isassociated(&ge->sigrdataset_a)) {
			result = dns_message_gettemprdataset(msg,
							     &sigrdataset_a);
			if (result != ISC_R_SUCCESS) {
				/* Undo everything acquired so far. */
				if (rdataset_a != NULL) {
					dns_message_puttemprdataset(
						msg, &rdataset_a);
				}
				dns_message_puttempname(msg, &name);
				goto no_glue;
			}
		}

		if
 (dns_rdataset_isassociated(&ge->rdataset_aaaa)) {
			result = dns_message_gettemprdataset(msg,
							     &rdataset_aaaa);
			if (result != ISC_R_SUCCESS) {
				/* Undo everything acquired so far. */
				dns_message_puttempname(msg, &name);
				if (rdataset_a != NULL) {
					dns_message_puttemprdataset(
						msg, &rdataset_a);
				}
				if (sigrdataset_a != NULL) {
					dns_message_puttemprdataset(
						msg, &sigrdataset_a);
				}
				goto no_glue;
			}
		}

		if (dns_rdataset_isassociated(&ge->sigrdataset_aaaa)) {
			result = dns_message_gettemprdataset(msg,
							     &sigrdataset_aaaa);
			if (result != ISC_R_SUCCESS) {
				/* Undo everything acquired so far. */
				dns_message_puttempname(msg, &name);
				if (rdataset_a != NULL) {
					dns_message_puttemprdataset(
						msg, &rdataset_a);
				}
				if (sigrdataset_a != NULL) {
					dns_message_puttemprdataset(
						msg, &sigrdataset_a);
				}
				if (rdataset_aaaa != NULL) {
					dns_message_puttemprdataset(
						msg, &rdataset_aaaa);
				}
				goto no_glue;
			}
		}

		/* Clone the cached rdatasets and hang them off the name. */
		if (rdataset_a != NULL) {
			dns_rdataset_clone(&ge->rdataset_a, rdataset_a);
			ISC_LIST_APPEND(name->list, rdataset_a, link);
		}

		if (sigrdataset_a != NULL) {
			dns_rdataset_clone(&ge->sigrdataset_a, sigrdataset_a);
			ISC_LIST_APPEND(name->list, sigrdataset_a, link);
		}

		if (rdataset_aaaa != NULL) {
			dns_rdataset_clone(&ge->rdataset_aaaa, rdataset_aaaa);
			ISC_LIST_APPEND(name->list, rdataset_aaaa, link);
		}

		if (sigrdataset_aaaa != NULL) {
			dns_rdataset_clone(&ge->sigrdataset_aaaa,
					   sigrdataset_aaaa);
			ISC_LIST_APPEND(name->list, sigrdataset_aaaa, link);
		}

		dns_message_addname(msg, name, DNS_SECTION_ADDITIONAL);
	}

no_glue:
	RWUNLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_read);

	if (found) {
		return (ISC_R_SUCCESS);
	}

	/* A second miss after populating the cache is a hard failure. */
	if (restarted) {
		return (ISC_R_FAILURE);
	}

	/*
	 * No cached glue was found in the table.  Cache it and restart
	 * this function.
	 *
	 * Due to the gap between the read lock and the write lock, it's
	 * possible that we may cache a duplicate glue table entry, but
	 * we don't care.
	 */
	ctx.glue_list = NULL;
	ctx.rbtdb = rbtdb;
	ctx.rbtversion = rbtversion;

	RWLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_write);

	/* Grow the table first if it is overcommitted. */
	maybe_rehash_gluetable(rbtversion);
	idx = hash_32(hash, rbtversion->glue_table_bits);

	/* Build the glue list via the per-NS-name callback. */
	(void)dns_rdataset_additionaldata(rdataset, dns_rootname,
					  glue_nsdname_cb, &ctx);

	cur = isc_mem_get(rbtdb->common.mctx, sizeof(*cur));

	/*
	 * XXXMUKS: it looks like the dns_dbversion is not destroyed
	 * when named is terminated by a keyboard break.  This doesn't
	 * cleanup the node reference and keeps the process dangling.
	 */
	/* isc_refcount_increment0(&node->references); */
	cur->node = node;

	if (ctx.glue_list == NULL) {
		/*
		 * No glue was found.  Cache it so.
		 */
		cur->glue_list = (void *)-1;

		if (rbtdb->gluecachestats != NULL) {
			isc_stats_increment(
				rbtdb->gluecachestats,
				dns_gluecachestatscounter_inserts_absent);
		}
	} else {
		cur->glue_list = ctx.glue_list;

		if (rbtdb->gluecachestats != NULL) {
			isc_stats_increment(
				rbtdb->gluecachestats,
				dns_gluecachestatscounter_inserts_present);
		}
	}

	/* Link the new entry into the hash chain and retry the lookup. */
	cur->next = rbtversion->glue_table[idx];
	rbtversion->glue_table[idx] = cur;
	rbtversion->glue_table_nodecount++;

	RWUNLOCK(&rbtversion->glue_rwlock, isc_rwlocktype_write);

	restarted = true;
	goto restart;

	/* UNREACHABLE */
}

/*%
 * Routines for LRU-based cache management.
 */

/*%
 * See if a given cache entry that is being reused needs to be updated
 * in the LRU-list.  From the LRU management point of view, this function is
 * expected to return true for almost all cases.  When used with threads,
 * however, this may cause a non-negligible performance penalty because a
 * writer lock will have to be acquired before updating the list.
 * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
 * function returns true if the entry has not been updated for some period of
 * time.
 We differentiate the NS or glue address case and the others since
 * experiments have shown that the former tends to be accessed relatively
 * infrequently and the cost of cache miss is higher (e.g., a missing NS records
 * may cause external queries at a higher level zone, involving more
 * transactions).
 *
 * Caller must hold the node (read or write) lock.
 */
static bool
need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) {
	/* Dead entries never need an LRU update. */
	if (RDATASET_ATTR_GET(header,
			      (RDATASET_ATTR_NONEXISTENT |
			       RDATASET_ATTR_ANCIENT |
			       RDATASET_ATTR_ZEROTTL)) != 0)
	{
		return (false);
	}

#if DNS_RBTDB_LIMITLRUUPDATE
	if (header->type == dns_rdatatype_ns ||
	    (header->trust == dns_trust_glue &&
	     (header->type == dns_rdatatype_a ||
	      header->type == dns_rdatatype_aaaa)))
	{
		/*
		 * Glue records are updated if at least DNS_RBTDB_LRUUPDATE_GLUE
		 * seconds have passed since the previous update time.
		 */
		return (header->last_used + DNS_RBTDB_LRUUPDATE_GLUE <= now);
	}

	/*
	 * Other records are updated if DNS_RBTDB_LRUUPDATE_REGULAR seconds
	 * have passed.
	 */
	return (header->last_used + DNS_RBTDB_LRUUPDATE_REGULAR <= now);
#else
	UNUSED(now);
	return (true);
#endif /* if DNS_RBTDB_LIMITLRUUPDATE */
}

/*%
 * Update the timestamp of a given cache entry and move it to the head
 * of the corresponding LRU list.
 *
 * Caller must hold the node (write) lock.
 *
 * Note that we do NOT touch the heap here, as the TTL has not changed.
 */
static void
update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now) {
	INSIST(IS_CACHE(rbtdb));

	/* To be checked: can we really assume this?
 XXXMLG */
	INSIST(ISC_LINK_LINKED(header, link));

	/* Move the entry to the head of its lock's LRU list. */
	ISC_LIST_UNLINK(rbtdb->rdatasets[header->node->locknum], header, link);
	header->last_used = now;
	ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum], header, link);
}

/*
 * Expire entries from the tail of one LRU list until roughly
 * 'purgesize' bytes have been reclaimed or the list tail is newer than
 * the global rbtdb->last_used watermark.  Returns the bytes purged.
 *
 * Caller must hold the node write lock for 'locknum'.
 */
static size_t
expire_lru_headers(dns_rbtdb_t *rbtdb, unsigned int locknum, size_t purgesize,
		   bool tree_locked) {
	rdatasetheader_t *header, *header_prev;
	size_t purged = 0;

	for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
	     header != NULL &&
	     header->last_used <= atomic_load(&rbtdb->last_used) &&
	     purged <= purgesize;
	     header = header_prev)
	{
		header_prev = ISC_LIST_PREV(header, link);
		/*
		 * Unlink the entry at this point to avoid checking it
		 * again even if it's currently used by someone else and
		 * cannot be purged at this moment.  This entry won't be
		 * referenced any more (so unlinking is safe) since the
		 * TTL was reset to 0.
		 */
		ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header, link);
		size_t header_size = rdataset_size(header);
		expire_header(rbtdb, header, tree_locked, expire_lru);
		purged += header_size;
	}

	return (purged);
}

/*%
 * Purge some stale (i.e. unused for some period - LRU based cleaning) cache
 * entries under the overmem condition.  To recover from this condition quickly,
 * we cleanup entries up to the size of newly added rdata (passed as purgesize).
 *
 * The LRU lists tails are processed in LRU order to the nearest second.
 *
 * A write lock on the tree must be held.
 */
static void
overmem_purge(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
	      bool tree_locked) {
	uint32_t locknum_start = atomic_fetch_add(&rbtdb->lru_sweep, 1) %
				 rbtdb->node_lock_count;
	uint32_t locknum = locknum_start;
	/* Size of added data, possible node and possible ENT node.
	 */
	size_t purgesize = rdataset_size(newheader) +
			   2 * dns__rbtnode_getsize(newheader->node);
	size_t purged = 0;
	isc_stdtime_t min_last_used = 0;
	/* Bound the number of watermark-advancing sweeps. */
	size_t max_passes = 8;

again:
	do {
		NODE_LOCK(&rbtdb->node_locks[locknum].lock,
			  isc_rwlocktype_write);

		purged += expire_lru_headers(rbtdb, locknum,
					     purgesize - purged, tree_locked);

		/*
		 * Work out the oldest remaining last_used values of the list
		 * tails as we walk across the array of lru lists.
		 */
		rdatasetheader_t *header =
			ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
		if (header != NULL &&
		    (min_last_used == 0 || header->last_used < min_last_used))
		{
			min_last_used = header->last_used;
		}

		NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
			    isc_rwlocktype_write);
		locknum = (locknum + 1) % rbtdb->node_lock_count;
	} while (locknum != locknum_start && purged <= purgesize);

	/*
	 * Update rbtdb->last_used if we have walked all the list tails and have
	 * not freed the required amount of memory.
	 */
	if (purged < purgesize) {
		if (min_last_used != 0) {
			atomic_store(&rbtdb->last_used, min_last_used);
			if (max_passes-- > 0) {
				goto again;
			}
		}
	}
}

/*
 * Expire a cache entry: zero its TTL, mark it ancient, and, when the
 * node is otherwise unreferenced, clean the node up and account the
 * expiration in the cache statistics.
 *
 * Caller must hold the node (write) lock.
 */
static void
expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
	      expire_t reason) {
	set_ttl(rbtdb, header, 0);
	mark_header_ancient(rbtdb, header);

	if (isc_refcount_current(&header->node->references) == 0) {
		/*
		 * If no one else is using the node, we can clean it up now.
		 * We first need to gain a new reference to the node to meet a
		 * requirement of decrement_reference().
		 */
		new_reference(rbtdb, header->node, isc_rwlocktype_write);
		decrement_reference(rbtdb, header->node, 0,
				    isc_rwlocktype_write,
				    tree_locked ? isc_rwlocktype_write
						: isc_rwlocktype_none,
				    false);

		if (rbtdb->cachestats == NULL) {
			return;
		}

		switch (reason) {
		case expire_ttl:
			isc_stats_increment(rbtdb->cachestats,
					    dns_cachestatscounter_deletettl);
			break;
		case expire_lru:
			isc_stats_increment(rbtdb->cachestats,
					    dns_cachestatscounter_deletelru);
			break;
		default:
			break;
		}
	}
}