I put my khash.h library here. It is now compatible with VC++ 8.0. I believe it is quite stable now.
/* The MIT License Copyright (c) 2008, 2009 by attractor <attractor@live.co.uk> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { int ret, is_missing; khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); if (!ret) kh_del(32, h, k); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); k = kh_get(32, h, 5); kh_del(32, h, k); for (k = kh_begin(h); k != kh_end(h); ++k) if (kh_exist(h, k)) kh_value(h, k) = 1; kh_destroy(32, h); return 0; } */ /* 2009-09-26 (0.2.4): * Improve portability 2008-09-19 (0.2.3): * Corrected the example * Improved interfaces 2008-09-11 (0.2.2): * Improved speed a little in kh_put() 2008-09-10 (0.2.1): * Added kh_clear() * Fixed a compiling error 2008-09-02 (0.2.0): * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): * Added destructor */ #ifndef __AC_KHASH_H #define __AC_KHASH_H /*! @header Generic hash table library. @copyright Heng Li */ #define AC_VERSION_KHASH_H "0.2.4" #include <stdlib.h> #include <string.h> #include <limits.h> /* compipler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX typedef unsigned long khint64_t; #else typedef unsigned long long khint64_t; #endif #ifdef _MSC_VER #define inline __inline #endif typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_HASH_PRIME_SIZE 32 static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = { 0ul, 3ul, 11ul, 23ul, 53ul, 97ul, 193ul, 389ul, 769ul, 1543ul, 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, 3221225473ul, 4294967291ul }; #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) static const double __ac_HASH_UPPER = 0.77; #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ typedef struct { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ } kh_##name##_t; \ static inline kh_##name##_t *kh_init_##name() { \ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ static inline void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ free(h->keys); free(h->flags); \ free(h->vals); \ free(h); \ } \ } \ static inline void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t inc, k, i, last; \ k = __hash_func(key); i = k % h->n_buckets; \ inc = 1 + k % (h->n_buckets - 1); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ else i += inc; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ khint_t t = __ac_HASH_PRIME_SIZE - 1; \ while (__ac_prime_list[t] > new_n_buckets) --t; \ new_n_buckets = __ac_prime_list[t+1]; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ else { \ new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) \ h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ } \ } \ if (j) { \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ while (1) { \ khint_t inc, k, i; \ k = __hash_func(key); \ i = k % new_n_buckets; \ inc = 1 + k % (new_n_buckets - 1); \ while (!__ac_isempty(new_flags, i)) { \ if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ else i += inc; \ } \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ __ac_set_isdel_true(h->flags, i); \ } else { \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ } \ } \ } \ } \ if (h->n_buckets > new_n_buckets) { \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) \ h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ free(h->flags); \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ } \ static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { \ if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ else kh_resize_##name(h, h->n_buckets + 1); \ } \ { \ khint_t inc, k, i, site, last; \ x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ if (__ac_isempty(h->flags, i)) x = i; \ else { \ inc = 1 + k % (h->n_buckets - 1); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ else i += inc; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ else x = i; \ } \ } \ } \ if (__ac_isempty(h->flags, x)) { \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ } else if (__ac_isdel(h->flags, x)) { \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ } else *ret = 0; \ return x; \ } \ static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ --h->size; \ } \ } /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @abstract Integer hash function @param key The integer [khint32_t] @return The hash value [khint_t] */ #define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function @param key The integer [khint64_t] @return The hash value [khint_t] */ #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ static inline khint_t __ac_X31_hash_string(const char *s) { khint_t h = *s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; return h; } /*! @function @abstract Another interface to const char* hash function @param key Pointer to a null terminated string [const char*] @return The hash value [khint_t] */ #define kh_str_hash_func(key) __ac_X31_hash_string(key) /*! @function @abstract Const char* comparison function */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) /* --- END OF HASH FUNCTIONS --- */ /* Other necessary macros... */ /*! @abstract Type of the hash table. @param name Name of the hash table [symbol] */ #define khash_t(name) kh_##name##_t /*! @function @abstract Initiate a hash table. @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ #define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_destroy(name, h) kh_destroy_##name(h) /*! @function @abstract Reset a hash table without deallocating memory. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_clear(name, h) kh_clear_##name(h) /*! @function @abstract Resize a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param s New size [khint_t] */ #define kh_resize(name, h, s) kh_resize_##name(h, s) /*! @function @abstract Insert a key to the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @param r Extra return code: 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) /*! @function @abstract Retrieve a key from the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) /*! @function @abstract Remove a key from the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Iterator to the element to be deleted [khint_t] */ #define kh_del(name, h, k) kh_del_##name(h, k) /*! @function @abstract Test whether a bucket contains data. @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return 1 if containing data; 0 otherwise [int] */ #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) /*! @function @abstract Get key given an iterator @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return Key [type of keys] */ #define kh_key(h, x) ((h)->keys[x]) /*! @function @abstract Get value given an iterator @param h Pointer to the hash table [khash_t(name)*] @param x Iterator to the bucket [khint_t] @return Value [type of values] @discussion For hash sets, calling this results in segfault. */ #define kh_val(h, x) ((h)->vals[x]) /*! @function @abstract Alias of kh_val() */ #define kh_value(h, x) ((h)->vals[x]) /*! @function @abstract Get the start iterator @param h Pointer to the hash table [khash_t(name)*] @return The start iterator [khint_t] */ #define kh_begin(h) (khint_t)(0) /*! @function @abstract Get the end iterator @param h Pointer to the hash table [khash_t(name)*] @return The end iterator [khint_t] */ #define kh_end(h) ((h)->n_buckets) /*! @function @abstract Get the number of elements in the hash table @param h Pointer to the hash table [khash_t(name)*] @return Number of elements in the hash table [khint_t] */ #define kh_size(h) ((h)->size) /*! @function @abstract Get the number of buckets in the hash table @param h Pointer to the hash table [khash_t(name)*] @return Number of buckets in the hash table [khint_t] */ #define kh_n_buckets(h) ((h)->n_buckets) /* More conenient interfaces */ /*! @function @abstract Instantiate a hash set containing integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT(name) \ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT(name, khval_t) \ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) typedef const char *kh_cstr_t; /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_STR(name) \ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_STR(name, khval_t) \ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) #endif /* __AC_KHASH_H */
Here is a small example:
#include <stdio.h> #include <assert.h> #include <time.h> #include <stdlib.h> #include <string.h> #include "khash.h" KHASH_SET_INIT_STR(str) KHASH_SET_INIT_INT(int) static int data_size = 5000000; static unsigned *int_data; static char **str_data; void ht_init_data() { int i; char buf[256]; khint32_t x = 11; printf("--- generating data... "); int_data = (unsigned*)calloc(data_size, sizeof(unsigned)); str_data = (char**)calloc(data_size, sizeof(char*)); for (i = 0; i < data_size; ++i) { int_data[i] = (unsigned)(data_size * ((double)x / UINT_MAX) / 4) * 271828183u; sprintf(buf, "%x", int_data[i]); str_data[i] = strdup(buf); x = 1664525L * x + 1013904223L; } printf("done!\n"); } void ht_destroy_data() { int i; for (i = 0; i < data_size; ++i) free(str_data[i]); free(str_data); free(int_data); } void ht_khash_int() { int i, ret; unsigned *data = int_data; khash_t(int) *h; unsigned k; h = kh_init(int); for (i = 0; i < data_size; ++i) { k = kh_put(int, h, data[i], &ret); if (!ret) kh_del(int, h, k); } printf("[ht_khash_int] size: %u\n", kh_size(h)); kh_destroy(int, h); } void ht_khash_str() { int i, ret; char **data = str_data; khash_t(str) *h; unsigned k; h = kh_init(str); for (i = 0; i < data_size; ++i) { k = kh_put(str, h, data[i], &ret); if (!ret) kh_del(str, h, k); } printf("[ht_khash_int] size: %u\n", kh_size(h)); kh_destroy(str, h); } void ht_timing(void (*f)(void)) { clock_t t = clock(); (*f)(); printf("[ht_timing] %.3lf sec\n", (double)(clock() - t) / CLOCKS_PER_SEC); } int main(int argc, char *argv[]) { if (argc > 1) data_size = atoi(argv[1]); ht_init_data(); ht_timing(ht_khash_int); ht_timing(ht_khash_str); ht_destroy_data(); return 0; }
[…] https://attractivechaos.wordpress.com/2009/09/29/khash-h/ Leave a Comment […]
Hello Chaos,
Using khash, I created a hash table and putted a large bunch of key-value pairs into it (keys are of cstring type). However it not worked as what expected. All the keys returned by get_key are same. What’s the matter? I have mailed you for help, do you receive it?
Here is my code:
#include
#include
#include
#include “khash.h”
KHASH_MAP_INIT_STR(str, int); /* key is string, value is int*/ int main() {
double x = 1.53;
int i;
khash_t(str) *h;
h = kh_init(str);
unsigned int iter; /* a iterator of a element (a key-value pair)in hash table*/
int ret; /* a return value of kh_put operation, 0 means a key already exist */
char key[256]; /* key [type of c string]*/
for(i=0; i < 100; i ) {
x = 0.01;
sprintf(key, "%.5f", x);
iter = kh_put(str, h, key,
The hash table keeps the point of a string, but not the content of the string. When you insert str, the hash table keeps the address of str everywhere and the content is the same. What you should do is to put strdup(str). But remember to traverse the hash table to free the memory allocated by strdup() before you destroy the hash table.
Currently many links on your pages and on blog point to khash.h in version 0.2.3, while ONLY place I was able to find version 0.2.4 is this post.
For example places to correct (update) version are:
http://attractivechaos.awardspace.com/khash.h.html
http://attractivechaos.awardspace.com/
The most recent version has been moved here:
https://github.com/attractivechaos/klib
Hi Chaos,
In your e.g., after the put operation:
k = kh_put(str, h, data[i], &ret);
you added the line below:
if (!ret) kh_del(str, h, k);
to delete k if ret returns 0 (implying the key is already present in the hash table).
I see in your another post that ‘For … khash.h, one operation is needed to achieve “insert if absent; delete otherwise”.’
But I wonder why you need to do so.
Is it just a design of the example/benchmark?
Also, what is the behavior of kh_put when it found the hash entry is not empty, is it kh_put will return the existing entry’s key and no overwriting of the old entry with the new?
I also wonder how to put value to the hash entry?
Is it after calling the put, then followed by
kh_value(h, k) = some pointer;
Thanks a lot.
kh_put() will not overwrite the key if the old_key “equals” the key you want to insert. If you want to change the key, you may modify the key with “kh_key(h,k)=new_key”. Note that in this case, the new key must “equal” the old key (for C strings, the content is identical). If you want to change value, “kh_value(h,k)=val”.
Hi Chaotic Attractor!
I wonder: Why do you make a distinction between empty and deleted entries?
I see that kh_del() decrements size but not n_occupied while also kh_put() increments only size for a previously deleted entry. That means deleted entries still count as “occupied” but can be reused later. What’s the idea behind this?
And will a kh_resize() make all deleted entries empty?
Thanks!
Duh! I found out why!
khash is using linear hashing. If an entry is already occupied rehash and use a different index. But if one deletes the first entry, this is a special case: the second entry must still be found.
That’s why there’s the difference between deleted and empty entries. Deleted entries are not the end of the bucket.
can any body post a working example with integer as key and data
In the interest of not reinventing the wheel, is there a version of khash.h that has store and load functionality?
The first function would save a binary image of the hash+data to disk and the second would memory map the resulting file. Load can pretty safely be a memory map because if the saved structure will not fit into memory, it wouldn’t have fit during a normal program run either. Alternatively, to use less space, one might store the file offset to each entry with the hash, rather than the entry’s data. For most purposes the binary form of the hash+data stored on disk need not be portable to other architectures.
This would be useful in situations like this one:
https://gist.github.com/1168330/363558e3147afffcee0096e84a94aabfaf2e7e20#file-fqextract-c
where a large data set must be traversed and all entries hashed each time a subset is to be extracted. When large numbers of different subsets are to be extracted from a static data set that results in a lot of repeated computation and I/O. The current version of that file would be fastest for a single extraction, and the save/load one faster for all subsequent runs.
Here is an alternative hashmap implementation in C that is a mature library with unit tests and a more natural type-safe generic API.
https://github.com/DavidLeeds/hashmap
[…] https://attractivechaos.wordpress.com/2009/09/29/khash-h/ […]
[…] khash.h […]
[…] https://attractivechaos.wordpress.com/2009/09/29/khash-h/ […]
[…] Well if you know the basics behind them, it shouldn’t be too hard.Generally you create an array called “buckets” that contain the key and value, with an optional pointer to create a linked list.When you access the hash table with a key, you process the key with a custom hash function which will return an integer. You then take the modulus of the result and that is the location of your array index or “bucket”. Then you check the unhashed key with the stored key, and if it matches, then you found the right place.Otherwise, you’ve had a “collision” and must crawl through the linked list and compare keys until you match. (note some implementations use a binary tree instead of linked list for collisions).Check out this fast hash table implementation:https://attractivechaos.wordpress.com/2009/09/29/khash-h/ […]
[…] khash.h […]
[…] https://attractivechaos.wordpress.com/2009/09/29/khash-h/ […]
[…] khash.h […]
[…] https://attractivechaos.wordpress.com/2009/09/29/khash-h/ […]