版本
6.0.3
基础类型robj
// server.h
typedef struct redisObject {
unsigned type:4;
unsigned encoding:4;
unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
* LFU data (least significant 8 bits frequency
* and most significant 16 bits access time). */
int refcount;
void *ptr;
} robj;
// type:类型,有以下7种
#define OBJ_STRING 0 /* String object. */
#define OBJ_LIST 1 /* List object. */
#define OBJ_SET 2 /* Set object. */
#define OBJ_ZSET 3 /* Sorted set object. */
#define OBJ_HASH 4 /* Hash object. */
#define OBJ_MODULE 5 /* Module object. */
#define OBJ_STREAM 6 /* Stream object. */
// encoding:编码
#define OBJ_ENCODING_RAW 0 /* Raw representation */
#define OBJ_ENCODING_INT 1 /* Encoded as integer */
#define OBJ_ENCODING_HT 2 /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3 /* Encoded as zipmap */ 是个旧的表示方式,已不再用。在小于Redis 2.6的版本中才有。
#define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */ 也是个旧的表示方式,已不再用。
#define OBJ_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
#define OBJ_ENCODING_INTSET 6 /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7 /* Encoded as skiplist */
#define OBJ_ENCODING_EMBSTR 8 /* Embedded sds string encoding */
#define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of ziplists */
#define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */
// refcount: 引用计数
// ptr:数据指针,指向真正的数据。
如字符串,type是OBJ_STRING,encoding是OBJ_ENCODING_RAW,OBJ_ENCODING_INT,OBJ_ENCODING_EMBSTR之一
type | 可能的encoding |
---|---|
OBJ_STRING | OBJ_ENCODING_INT、OBJ_ENCODING_RAW、OBJ_ENCODING_EMBSTR |
OBJ_LIST | OBJ_ENCODING_QUICKLIST |
OBJ_SET | OBJ_ENCODING_INTSET、OBJ_ENCODING_HT |
OBJ_ZSET | OBJ_ENCODING_ZIPLIST、OBJ_ENCODING_SKIPLIST |
OBJ_HASH | OBJ_ENCODING_ZIPLIST、OBJ_ENCODING_HT |
1. String
type: OBJ_STRING
encoding:OBJ_ENCODING_RAW,OBJ_ENCODING_INT,OBJ_ENCODING_EMBSTR
encoding | condition | interface |
---|---|---|
OBJ_ENCODING_INT | val可转化为long时 if (len <= 20 && string2l(s,len,&value))
|
string2ll |
OBJ_ENCODING_EMBSTR | 非long且len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT //44
|
createEmbeddedStringObject |
OBJ_ENCODING_RAW | 非以上情况时 | createRawStringObjectt_s |
redis的string实现是SDS(simple dynamic string),是作者自己定义的,而不是用C的字符串。
定义
// __attribute__ ((__packed__)) 取消字节对齐,紧凑排列。
/* Note: sdshdr5 is never used, we just access the flags byte directly.
* However is here to document the layout of type 5 SDS strings. */
struct __attribute__ ((__packed__)) sdshdr5 {
unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
uint8_t len; /* used */
uint8_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
uint16_t len; /* used */
uint16_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
uint32_t len; /* used */
uint32_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
uint64_t len; /* used */
uint64_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
// flags值定义
#define SDS_TYPE_5 0
#define SDS_TYPE_8 1
#define SDS_TYPE_16 2
#define SDS_TYPE_32 3
#define SDS_TYPE_64 4
- len:记录当前已使用的字节数(不包括'\0'),获取SDS长度的复杂度为O(1)
- alloc:记录当前字节数组总共分配的字节数量(不包括'\0')
- flags:标记当前字节数组的属性,是sdshdr8还是sdshdr16等
- buf:字节数组,用于保存字符串,包括结尾空白字符'\0'
创建字符串
- 算len,调用sdsnewlen
- 根据len决定用哪种类型
- 分配内存,赋值
- 计算len,alloc,flag
/* Create a new sds string starting from a null terminated C string. */
sds sdsnew(const char *init) {
size_t initlen = (init == NULL) ? 0 : strlen(init);
return sdsnewlen(init, initlen);
}
// If NULL is used for 'init' the string is initialized with zero bytes.
// If SDS_NOINIT is used, the buffer is left uninitialized;
sds sdsnewlen(const void *init, size_t initlen) {
void *sh;
sds s;
char type = sdsReqType(initlen);
/* Empty strings are usually created in order to append. Use type 8
* since type 5 is not good at this. */
if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
int hdrlen = sdsHdrSize(type);
unsigned char *fp; /* flags pointer. */
sh = s_malloc(hdrlen+initlen+1);
if (sh == NULL) return NULL;
if (init==SDS_NOINIT)
init = NULL;
else if (!init)
memset(sh, 0, hdrlen+initlen+1);
s = (char*)sh+hdrlen;
fp = ((unsigned char*)s)-1;
switch(type) {
case SDS_TYPE_5: {
*fp = type | (initlen << SDS_TYPE_BITS);
break;
}
case SDS_TYPE_8: {
SDS_HDR_VAR(8,s);
sh->len = initlen;
sh->alloc = initlen;
*fp = type;
break;
}
case SDS_TYPE_16: {
SDS_HDR_VAR(16,s);
sh->len = initlen;
sh->alloc = initlen;
*fp = type;
break;
}
case SDS_TYPE_32: {
SDS_HDR_VAR(32,s);
sh->len = initlen;
sh->alloc = initlen;
*fp = type;
break;
}
case SDS_TYPE_64: {
SDS_HDR_VAR(64,s);
sh->len = initlen;
sh->alloc = initlen;
*fp = type;
break;
}
}
if (initlen && init)
memcpy(s, init, initlen);
s[initlen] = '\0';
return s;
}
2、扩容
- 计算剩余可以空间,够用直接返回
- 计算扩容后所需长度newlen
- < 1M,成倍扩容
- >= 1M,每次只增加1M
- 根据newlen计算新的type
- 类型不变alloc,类型变了malloc,memcpy,free
sds sdsMakeRoomFor(sds s, size_t addlen) {
void *sh, *newsh;
size_t avail = sdsavail(s);
size_t len, newlen;
char type, oldtype = s[-1] & SDS_TYPE_MASK;
int hdrlen;
/* Return ASAP if there is enough space left. */
if (avail >= addlen) return s;
len = sdslen(s);
sh = (char*)s-sdsHdrSize(oldtype);
newlen = (len+addlen);
if (newlen < SDS_MAX_PREALLOC)
newlen *= 2;
else
newlen += SDS_MAX_PREALLOC;
type = sdsReqType(newlen);
/* Don't use type 5: the user is appending to the string and type 5 is
* not able to remember empty space, so sdsMakeRoomFor() must be called
* at every appending operation. */
if (type == SDS_TYPE_5) type = SDS_TYPE_8;
hdrlen = sdsHdrSize(type);
if (oldtype==type) {
newsh = s_realloc(sh, hdrlen+newlen+1);
if (newsh == NULL) return NULL;
s = (char*)newsh+hdrlen;
} else {
/* Since the header size changes, need to move the string forward,
* and can't use realloc */
newsh = s_malloc(hdrlen+newlen+1);
if (newsh == NULL) return NULL;
memcpy((char*)newsh+hdrlen, s, len+1);
s_free(sh);
s = (char*)newsh+hdrlen;
s[-1] = type;
sdssetlen(s, len);
}
sdssetalloc(s, newlen);
return s;
}
string set过程
- 接受客户端协议并解析,此时全部为sds,编码为OBJ_ENCODING_RAW或OBJ_ENCODING_EMBSTR。
- set首先尝试把val转为OBJ_ENCODING_INT(
tryObjectEncoding
)。 - 然后真正set
与c字符串相比的优势
- 获取字符串长度O(1)。
- 预分配内存,不用每次修改字符串长度都重新分配内存,减少了内存重分配次数。
- 用len判断字符串结束,而不是空字符,可以存储二进制数据(如bitmap)。
2、List
type: OBJ_LIST
encoding: OBJ_ENCODING_QUICKLIST
quicklist是一个双向链表,链表的每个节点是一个ziplist
127.0.0.1:6379> lpush test 111
(integer) 2
127.0.0.1:6379> OBJECT ENCODING test
"quicklist"
typedef struct quicklistNode {
struct quicklistNode *prev;
struct quicklistNode *next;
unsigned char *zl; //ziplist ptr
unsigned int sz; /* ziplist size in bytes */
unsigned int count : 16; /* count of items in ziplist */
unsigned int encoding : 2; /* RAW==1 or LZF==2 */
unsigned int container : 2; /* NONE==1 or ZIPLIST==2 */
unsigned int recompress : 1; /* was this node previous compressed? */
unsigned int attempted_compress : 1; /* node can't compress; too small */
unsigned int extra : 10; /* more bits to steal for future usage */
} quicklistNode;
[图片上传失败...(image-b3753c-1601286730477)]
ziplist简介
整体结构
/* The size of a ziplist header: two 32 bit integers for the total
* bytes count and last item offset. One 16 bit integer for the number
* of items field. */
// 两个32bit整数表示字节数和最后一个节点偏移,一个16bit整数表示节点数
#define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t))
/* Size of the "end of ziplist" entry. Just one byte. */ // ziplist结尾,占用1字节
#define ZIPLIST_END_SIZE (sizeof(uint8_t))
/* Create a new empty ziplist. */
unsigned char *ziplistNew(void) {
unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;
unsigned char *zl = zmalloc(bytes); // 分配下内存
ZIPLIST_BYTES(zl) = intrev32ifbe(bytes); // 设置字节数
ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE); //设置最后一个节点偏移
ZIPLIST_LENGTH(zl) = 0; // 长度0
zl[bytes-1] = ZIP_END; // 结尾固定为255 0xFF,支持双向遍历
return zl;
}
[图片上传失败...(image-caafe1-1601286730477)]
节点结构
[图片上传失败...(image-efad29-1601286730477)]
- pre_entry_length: 前一个节点的长度
- 1 字节:如果前一节点的长度小于 254 字节,便使用一个字节保存它的值。
- 5 字节:如果前一节点的长度大于等于 254 字节,那么将第 1 个字节的值设为 254 ,然后用接下来的 4 个字节保存实际长度。
- encoding:编码格式,根据数据大小确定占用的内存,极致节约
- length:content的内存占用
- content:实际内容
[图片上传失败...(image-b9c6e5-1601286730477)]
3.2之前
ziplist or linkedlist
- ziplist:连续内存,不需要每个节点都保存前后指针,内存占用少,但插入删除慢,适合数据少时使用
- linkedlist:插入删除快。但每个节点需要保存前后指针,内存开销大。且内存不连续,容易产生内存碎片。
quicklist 优势
- 结合ziplist和linkedlist的优点。
- 双向链表前后插入删除快、单个节点是ziplist,减少内存开销,减少内存碎片。
配置参数
list-max-ziplist-size -2
我们来详细解释一下这个参数的含义。它可以取正值,也可以取负值。
当取正值的时候,表示按照数据项个数来限定每个quicklist节点上的ziplist长度。比如,当这个参数配置成5的时候,表示每个quicklist节点的ziplist最多包含5个数据项。
当取负值的时候,表示按照占用字节数来限定每个quicklist节点上的ziplist长度。这时,它只能取-1到-5这五个值,每个值含义如下:
- -5: 每个quicklist节点上的ziplist大小不能超过64 Kb。(注:1kb => 1024 bytes)
- -4: 每个quicklist节点上的ziplist大小不能超过32 Kb。
- -3: 每个quicklist节点上的ziplist大小不能超过16 Kb。
- -2: 每个quicklist节点上的ziplist大小不能超过8 Kb。(-2是Redis给出的默认值)
- -1: 每个quicklist节点上的ziplist大小不能超过4 Kb。
list-compress-depth 0
这个参数表示一个quicklist两端不被压缩的节点个数。注:这里的节点个数是指quicklist双向链表的节点个数,而不是指ziplist里面的数据项个数。实际上,一个quicklist节点上的ziplist,如果被压缩,就是整体被压缩的。
参数list-compress-depth的取值含义如下:
- 0: 是个特殊值,表示都不压缩。这是Redis的默认值。
- 1: 表示quicklist两端各有1个节点不压缩,中间的节点压缩。
- 2: 表示quicklist两端各有2个节点不压缩,中间的节点压缩。
- 依此类推…
3、Hash
OBJ_ENCODING_ZIPLIST:
哈希对象保存的所有键值对的键和值的字符串长度都小于 64 字节;
哈希对象保存的键值对数量小于 512 个;
OBJ_ENCODING_HT: 不满于以上条件时。
typedef struct dictEntry {
void *key;
union {
void *val;
uint64_t u64;
int64_t s64;
double d;
} v;
struct dictEntry *next;
} dictEntry;
typedef struct dictType { // 函数指针,为了每种类型对应自己的函数
uint64_t (*hashFunction)(const void *key); // hash 函数
void *(*keyDup)(void *privdata, const void *key); // key 复制函数
void *(*valDup)(void *privdata, const void *obj); // val 复制函数
int (*keyCompare)(void *privdata, const void *key1, const void *key2); // key 比较函数
void (*keyDestructor)(void *privdata, void *key); // key 销毁函数
void (*valDestructor)(void *privdata, void *obj); // val 销毁函数
} dictType;
/* This is our hash table structure. Every dictionary has two of this as we
* implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
dictEntry **table;
unsigned long size; // 空间,pow(2,n)
unsigned long sizemask; // size - 1,h = dictHashKey(ht, he->key) & n.sizemask; 用于确定桶的位置
unsigned long used; // 元素数
} dictht;
typedef struct dict {
dictType *type; // 函数接口
void *privdata; // 函数接口的可选参数
dictht ht[2]; // 平时只有一个ht有值,rehash过程中两个都有值
long rehashidx; /* rehashing not in progress if rehashidx == -1 */ // rehash的进度
unsigned long iterators; /* number of iterators currently running */
} dict;
/* Hash type hash table (note that small hashes are represented with ziplists) */
dictType hashDictType = {
dictSdsHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCompare, /* key compare */
dictSdsDestructor, /* key destructor */
dictSdsDestructor /* val destructor */
};
/* Reset an hashtable already initialized with ht_init().
* NOTE: This function should only called by ht_destroy(). */
static void _dictReset(dict *ht) {
ht->table = NULL;
ht->size = 0;
ht->sizemask = 0;
ht->used = 0;
}
/* Create a new hash table */
static dict *dictCreate(dictType *type, void *privDataPtr) {
dict *ht = malloc(sizeof(*ht));
_dictInit(ht,type,privDataPtr);
return ht;
}
/* Initialize the hash table */
static int _dictInit(dict *ht, dictType *type, void *privDataPtr) {
_dictReset(ht);
ht->type = type;
ht->privdata = privDataPtr;
return DICT_OK;
}
[图片上传失败...(image-930d4f-1601286730477)]
- 解决hash冲突,拉链法。
扩容与缩容
- 初始大小 4
- 扩容后size为第一个大于等于ht[0].used * 2 的 2^n
- 缩容后size为第一个大于等于 ht[0].used 的 2^n
负载因子为used/size
扩容时机
(1)负载因子>=1,且当前没有子进程在执行aof文件重写或者生成RDB文件
(2)负载因子>=5
缩容时机
负载因子<0.1,且当前没有子进程在执行aof文件重写或者生成RDB文件
为什么限制BGSAVE 命令或 BGREWRITEAOF 命令执行时尽量不扩容缩容?
大多数操作系统都采用写时复制(copy-on-write)技术来优化子进程的使用效率,尽可能地避免在子进程存在期间进行哈希表扩展操作, 这可以避免不必要的内存写入操作, 最大限度地节约内存。
这里的内存修改主要是rehash过程中节点的指针修改。
关于写时复制可参考:https://juejin.im/post/6844903702373859335
渐进式rehash
- 为h[1]分配空间,同时存在两个dictht.
- rehashidx设为0,开始rehash
- 在 rehash 进行期间, 每次对字典执行添加、删除、查找或者更新操作时, 程序除了执行指定的操作以外, 还会顺带将 ht[0] 哈希表在 rehashidx 索引上的所有键值对 rehash 到 ht[1] , 当 rehash 工作完成之后, 程序将 rehashidx 属性的值增一。
- 当h[0]所有键值都被rehash到h[1],free h[0].table, h[0]=h[1], 重置h[1], rehashidx设为-1。rehash完成。
/* Performs N steps of incremental rehashing. Returns 1 if there are still
* keys to move from the old to the new hash table, otherwise 0 is returned.
*
* Note that a rehashing step consists in moving a bucket (that may have more
* than one key as we use chaining) from the old to the new hash table, however
* since part of the hash table may be composed of empty spaces, it is not
* guaranteed that this function will rehash even a single bucket, since it
* will visit at max N*10 empty buckets in total, otherwise the amount of
* work it does would be unbound and the function may block for a long time. */
int dictRehash(dict *d, int n) {
int empty_visits = n*10; /* Max number of empty buckets to visit. */
if (!dictIsRehashing(d)) return 0;
while(n-- && d->ht[0].used != 0) {
dictEntry *de, *nextde;
/* Note that rehashidx can't overflow as we are sure there are more
* elements because ht[0].used != 0 */
assert(d->ht[0].size > (unsigned long)d->rehashidx);
while(d->ht[0].table[d->rehashidx] == NULL) {
d->rehashidx++;
if (--empty_visits == 0) return 1;
}
de = d->ht[0].table[d->rehashidx];
/* Move all the keys in this bucket from the old to the new hash HT */
while(de) {
uint64_t h;
nextde = de->next;
/* Get the index in the new hash table */
h = dictHashKey(d, de->key) & d->ht[1].sizemask;
de->next = d->ht[1].table[h];
d->ht[1].table[h] = de;
d->ht[0].used--;
d->ht[1].used++;
de = nextde;
}
d->ht[0].table[d->rehashidx] = NULL;
d->rehashidx++;
}
/* Check if we already rehashed the whole table... */
if (d->ht[0].used == 0) {
zfree(d->ht[0].table);
d->ht[0] = d->ht[1];
_dictReset(&d->ht[1]);
d->rehashidx = -1;
return 0;
}
/* More to rehash... */
return 1;
}
rehash期间操作处理
- 删除、查找、更新在两个哈希表进行。
- 添加只在h[1]进行,h[0]只减不增,慢慢变空。
4、Set
- type: OBJ_SET
- encoding: OBJ_ENCODING_HT、 OBJ_ENCODING_INTSET
都是整数且数量不超过512使用INTSET,否则使用hashtable
/* Note that these encodings are ordered, so:
* INTSET_ENC_INT16 < INTSET_ENC_INT32 < INTSET_ENC_INT64. */
#define INTSET_ENC_INT16 (sizeof(int16_t))
#define INTSET_ENC_INT32 (sizeof(int32_t))
#define INTSET_ENC_INT64 (sizeof(int64_t))
typedef struct intset {
uint32_t encoding; // 以上三种之一,随着存储数据大小变更
uint32_t length;
int8_t contents[];
} intset;
static uint8_t _intsetValueEncoding(int64_t v) {
if (v < INT32_MIN || v > INT32_MAX)
| return INTSET_ENC_INT64;
else if (v < INT16_MIN || v > INT16_MAX)
| return INTSET_ENC_INT32;
else
| return INTSET_ENC_INT16;
}
// t_set.c
/* Add the specified value into a set.
*
* If the value was already member of the set, nothing is done and 0 is
* returned, otherwise the new element is added and 1 is returned. */
int setTypeAdd(robj *subject, sds value) {
long long llval;
if (subject->encoding == OBJ_ENCODING_HT) {
| dict *ht = subject->ptr;
| dictEntry *de = dictAddRaw(ht,value,NULL);
| if (de) {
| | dictSetKey(ht,de,sdsdup(value));
| | dictSetVal(ht,de,NULL);
| | return 1;
| }
} else if (subject->encoding == OBJ_ENCODING_INTSET) {
| if (isSdsRepresentableAsLongLong(value,&llval) == C_OK) {
| | uint8_t success = 0;
| | subject->ptr = intsetAdd(subject->ptr,llval,&success);
| | if (success) {
| | | /* Convert to regular set when the intset contains
| | | |* too many entries. */
| | | if (intsetLen(subject->ptr) > server.set_max_intset_entries)
| | | | setTypeConvert(subject,OBJ_ENCODING_HT);
| | | return 1;
| | }
| } else {
| | /* Failed to get integer from object, convert to regular set. */
| | setTypeConvert(subject,OBJ_ENCODING_HT);
| | /* The set *was* an intset and this value is not integer
| | |* encodable, so dictAdd should always work. */
| | serverAssert(dictAdd(subject->ptr,sdsdup(value),NULL) == DICT_OK);
| | return 1;
| }
} else {
| serverPanic("Unknown set encoding");
}
return 0;
}
add过程
- 当前是OBJ_ENCODING_HT,直接用hashtable的规则插入,值为NULL
- 当前是OBJ_ENCODING_INTSET,且要add的值是Int,add到set中,之后判断数量是否大于set_max_intset_entries(512),大于则转为hashtable
- 当前是OBJ_ENCODING_INTSET,且要add的值不是Int,先转换再add
intset插入过程
/* Insert an integer in the intset */
intset *intsetAdd(intset *is, int64_t value, uint8_t *success) {
uint8_t valenc = _intsetValueEncoding(value); // 计算value所需的encoding
uint32_t pos;
if (success) *success = 1; // 默认成功
/* Upgrade encoding if necessary. If we need to upgrade, we know that
|* this value should be either appended (if > 0) or prepended (if < 0),
|* because it lies outside the range of existing values. */
if (valenc > intrev32ifbe(is->encoding)) {
| /* This always succeeds, so we don't need to curry *success. */
| return intsetUpgradeAndAdd(is,value); // 提升encoding并add
} else {
| /* Abort if the value is already present in the set.
| |* This call will populate "pos" with the right position to insert
| |* the value when it cannot be found. */
| if (intsetSearch(is,value,&pos)) { // 存在返回1,pos为查到的位置. 不存在返回0,pos为要插入的位置
| | if (success) *success = 0; // 已经有了,失败
| | return is;
| }
| is = intsetResize(is,intrev32ifbe(is->length)+1); // 分配内存
| if (pos < intrev32ifbe(is->length)) intsetMoveTail(is,pos,pos+1); // 把pos的内存移到pos+1,留出空间
}
_intsetSet(is,pos,value); // 插入
is->length = intrev32ifbe(intrev32ifbe(is->length)+1); // 修改length
return is;
}
- 需要提升encoding时,先提升,再插入。
- 不需要提升encoding时,如果已存在,返回失败。不存在则扩容,memmove,插入,修改length。
5、ZSet
- type: OBJ_ZSET
- encoding: OBJ_ENCODING_ZIPLIST、OBJ_ENCODING_SKIPLIST
元素数量少于128个,所有元素小于64字节时使用ziplist,其他情况使用skiplist
skiplist
https://15721.courses.cs.cmu.edu/spring2018/papers/08-oltpindexes1/pugh-skiplists-cacm1990.pdf
* This skiplist implementation is almost a C translation of the original
* algorithm described by William Pugh in "Skip Lists: A Probabilistic
* Alternative to Balanced Trees", modified in three ways:
* a) this implementation allows for repeated scores.
* b) the comparison is not just by key (our 'score') but by satellite data.
* c) there is a back pointer, so it's a doubly linked list with the back
* pointers being only at "level 1". This allows to traverse the list
* from tail to head, useful for ZREVRANGE. */
// 基于论文Skip Lists: A Probabilistic Alternative to Balanced Trees,三点修改
// a. 支持重复score
// b. 不仅仅通过key比较,还通过其他数据
// c. 第一层有一个后退指针,可以从尾到头遍历,对ZREVRANGE很有用
#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^64 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
/* ZSETs use a specialized version of Skiplists */
typedef struct zskiplistNode {
sds ele; // 成员数据
double score; // 分数
struct zskiplistNode *backward; // 第一层链表中指向前一个节点的指针
struct zskiplistLevel { // 层
| struct zskiplistNode *forward; // 指向下一个节点的指针
| unsigned long span; // 节点跨度,用于计算排名
} level[];
} zskiplistNode;
typedef struct zskiplist {
struct zskiplistNode *header, *tail;
unsigned long length; // 节点总数
int level; // 跳表的最大层数
} zskiplist;
typedef struct zset {
dict *dict; // 存储成员及分值,用于快速索引
zskiplist *zsl; // 跳跃表,按分值排序成员。
} zset;
[图片上传失败...(image-e010fd-1601286730477)]
总结下内存节约
减少数据存储时占用内存
- sds紧凑排列,分为四种长度,区分长短字符串使用不同存储方式。
- ziplist不需要存储前后指针,且每个节点根据数据类型使用尽量小的内存存储。
- intset结构,根据数据大小动态调整encoding。
- BGSAVE 命令或 BGREWRITEAOF 命令执行时,提高hashtable负载因子,尽量不进行rehash
连续内存,减少内存碎片
- quicklist的node为ziplist
- hash在全整数时使用intset
参考资料
- redis设计与实现
- redis6.0.3源码
- 一些技术博客
http://zhangtielei.com/posts/blog-redis-skiplist.html