/* Darts -- Double-ARray Trie System $Id: darts.h.in 1575 2007-01-27 13:24:27Z taku $; Copyright(C) 2001-2007 Taku Kudo */ #ifndef DARTS_H_ #define DARTS_H_ #define DARTS_VERSION "@VERSION@" #include #include #include #ifdef HAVE_ZLIB_H namespace zlib { #include } #define SH(p)((unsigned short)(unsigned char)((p)[0]) | ((unsigned short)(unsigned char)((p)[1]) << 8)) #define LG(p)((unsigned long)(SH(p)) |((unsigned long)(SH((p)+2)) << 16)) #endif namespace Darts { template inline T _max(T x, T y) { return(x > y) ? x : y; } template inline T* _resize(T* ptr, size_t n, size_t l, T v) { T *tmp = new T[l]; for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i]; for (size_t i = n; i < l; ++i) tmp[i] = v; delete [] ptr; return tmp; } template class Length { public: size_t operator()(const T *key) const { size_t i; for (i = 0; key[i] != (T)0; ++i) {} return i; } }; template <> class Length { public: size_t operator()(const char *key) const { return std::strlen(key); } }; template > class DoubleArrayImpl { private: struct node_t { array_u_type_ code; size_t depth; size_t left; size_t right; }; struct unit_t { array_type_ base; array_u_type_ check; }; unit_t *array_; unsigned char *used_; size_t size_; size_t alloc_size_; node_type_ **key_; size_t key_size_; size_t *length_; array_type_ *value_; size_t progress_; size_t next_check_pos_; bool no_delete_; int error_; int (*progress_func_)(size_t, size_t); size_t resize(const size_t new_size) { unit_t tmp; tmp.base = 0; tmp.check = 0; array_ = _resize(array_, alloc_size_, new_size, tmp); used_ = _resize(used_, alloc_size_, new_size, static_cast(0)); alloc_size_ = new_size; return new_size; } size_t fetch(const node_t &parent, std::vector &siblings) { if (error_ < 0) return 0; array_u_type_ prev = 0; for (size_t i = parent.left; i < parent.right; ++i) { if ((length_ ? length_[i] : length_func_()(key_[i])) < parent.depth) continue; const node_u_type_ *tmp = reinterpret_cast(key_[i]); array_u_type_ cur = 0; if ((length_ ? length_[i] : length_func_()(key_[i])) != parent.depth) cur = (array_u_type_)tmp[parent.depth] + 1; if (prev > cur) { error_ = -3; return 0; } if (cur != prev || siblings.empty()) { node_t tmp_node; tmp_node.depth = parent.depth + 1; tmp_node.code = cur; tmp_node.left = i; if (!siblings.empty()) siblings[siblings.size()-1].right = i; siblings.push_back(tmp_node); } prev = cur; } if (!siblings.empty()) siblings[siblings.size()-1].right = parent.right; return siblings.size(); } size_t insert(const std::vector &siblings) { if (error_ < 0) return 0; size_t begin = 0; size_t pos = _max((size_t)siblings[0].code + 1, next_check_pos_) - 1; size_t nonzero_num = 0; int first = 0; if (alloc_size_ <= pos) resize(pos + 1); while (true) { next: ++pos; if (alloc_size_ <= pos) resize(pos + 1); if (array_[pos].check) { ++nonzero_num; continue; } else if (!first) { next_check_pos_ = pos; first = 1; } begin = pos - siblings[0].code; if (alloc_size_ <= (begin + siblings[siblings.size()-1].code)) resize(static_cast(alloc_size_ * _max(1.05, 1.0 * key_size_ / progress_))); if (used_[begin]) continue; for (size_t i = 1; i < siblings.size(); ++i) if (array_[begin + siblings[i].code].check != 0) goto next; break; } // -- Simple heuristics -- // if the percentage of non-empty contents in check between the index // 'next_check_pos' and 'check' is greater than some constant // value(e.g. 0.9), // new 'next_check_pos' index is written by 'check'. if (1.0 * nonzero_num/(pos - next_check_pos_ + 1) >= 0.95) next_check_pos_ = pos; used_[begin] = 1; size_ = _max(size_, begin + static_cast(siblings[siblings.size() - 1].code + 1)); for (size_t i = 0; i < siblings.size(); ++i) array_[begin + siblings[i].code].check = begin; for (size_t i = 0; i < siblings.size(); ++i) { std::vector new_siblings; if (!fetch(siblings[i], new_siblings)) { array_[begin + siblings[i].code].base = value_ ? static_cast(-value_[siblings[i].left]-1) : static_cast(-siblings[i].left-1); if (value_ && (array_type_)(-value_[siblings[i].left]-1) >= 0) { error_ = -2; return 0; } ++progress_; if (progress_func_)(*progress_func_)(progress_, key_size_); } else { size_t h = insert(new_siblings); array_[begin + siblings[i].code].base = h; } } return begin; } public: typedef array_type_ value_type; typedef node_type_ key_type; typedef array_type_ result_type; // for compatibility struct result_pair_type { value_type value; size_t length; }; explicit DoubleArrayImpl(): array_(0), used_(0), size_(0), alloc_size_(0), no_delete_(0), error_(0) {} ~DoubleArrayImpl() { clear(); } void set_result(value_type& x, value_type r, size_t) { x = r; } void set_result(result_pair_type& x, value_type r, size_t l) { x.value = r; x.length = l; } void set_array(void *ptr, size_t size = 0) { clear(); array_ = reinterpret_cast(ptr); no_delete_ = true; size_ = size; } const void *array() const { return const_cast(reinterpret_cast(array_)); } void clear() { if (!no_delete_) delete [] array_; delete [] used_; array_ = 0; used_ = 0; alloc_size_ = 0; size_ = 0; no_delete_ = false; } size_t unit_size() const { return sizeof(unit_t); } size_t size() const { return size_; } size_t total_size() const { return size_ * sizeof(unit_t); } size_t nonzero_size() const { size_t result = 0; for (size_t i = 0; i < size_; ++i) if (array_[i].check) ++result; return result; } int build(size_t key_size, key_type **key, size_t *length = 0, value_type *value = 0, int (*progress_func)(size_t, size_t) = 0) { if (!key_size || !key) return 0; progress_func_ = progress_func; key_ = key; length_ = length; key_size_ = key_size; value_ = value; progress_ = 0; resize(8192); array_[0].base = 1; next_check_pos_ = 0; node_t root_node; root_node.left = 0; root_node.right = key_size; root_node.depth = 0; std::vector siblings; fetch(root_node, siblings); insert(siblings); size_ += (1 << 8 * sizeof(key_type)) + 1; if (size_ >= alloc_size_) resize(size_); delete [] used_; used_ = 0; return error_; } int open(const char *file, const char *mode = "rb", size_t offset = 0, size_t size = 0) { std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; if (std::fseek(fp, offset, SEEK_SET) != 0) return -1; if (!size) { if (std::fseek(fp, 0L, SEEK_END) != 0) return -1; size = std::ftell(fp); if (std::fseek(fp, offset, SEEK_SET) != 0) return -1; } clear(); size_ = size; size_ /= sizeof(unit_t); array_ = new unit_t[size_]; if (size_ != std::fread(reinterpret_cast(array_), sizeof(unit_t), size_, fp)) return -1; std::fclose(fp); return 0; } int save(const char *file, const char *mode = "wb", size_t offset = 0) { if (!size_) return -1; std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; if (size_ != std::fwrite(reinterpret_cast(array_), sizeof(unit_t), size_, fp)) return -1; std::fclose(fp); return 0; } #ifdef HAVE_ZLIB_H int gzopen(const char *file, const char *mode = "rb", size_t offset = 0, size_t size = 0) { std::FILE *fp = std::fopen(file, mode); if (!fp) return -1; clear(); size_ = size; if (!size_) { if (-1L != static_cast(std::fseek(fp, -8, SEEK_END))) { char buf[8]; if (std::fread(static_cast(buf), 1, 8, fp) != sizeof(buf)) { std::fclose(fp); return -1; } size_ = LG(buf+4); size_ /= sizeof(unit_t); } } std::fclose(fp); if (!size_) return -1; zlib::gzFile gzfp = zlib::gzopen(file, mode); if (!gzfp) return -1; array_ = new unit_t[size_]; if (zlib::gzseek(gzfp, offset, SEEK_SET) != 0) return -1; zlib::gzread(gzfp, reinterpret_cast(array_), sizeof(unit_t) * size_); zlib::gzclose(gzfp); return 0; } int gzsave(const char *file, const char *mode = "wb", size_t offset = 0) { zlib::gzFile gzfp = zlib::gzopen(file, mode); if (!gzfp) return -1; zlib::gzwrite(gzfp, reinterpret_cast(array_), sizeof(unit_t) * size_); zlib::gzclose(gzfp); return 0; } #endif template inline void exactMatchSearch(const key_type *key, T & result, size_t len = 0, size_t node_pos = 0) { result = exactMatchSearch (key, len, node_pos); return; } template inline T exactMatchSearch(const key_type *key, size_t len = 0, size_t node_pos = 0) { if (!len) len = length_func_()(key); T result; set_result(result, -1, 0); register array_type_ b = array_[node_pos].base; register array_u_type_ p; for (register size_t i = 0; i < len; ++i) { p = b +(node_u_type_)(key[i]) + 1; if (static_cast(b) == array_[p].check) b = array_[p].base; else return result; } p = b; array_type_ n = array_[p].base; if (static_cast(b) == array_[p].check && n < 0) set_result(result, -n-1, len); return result; } template size_t commonPrefixSearch(const key_type *key, T* result, size_t result_len, size_t len = 0, size_t node_pos = 0) { if (!len) len = length_func_()(key); register array_type_ b = array_[node_pos].base; register size_t num = 0; register array_type_ n; register array_u_type_ p; for (register size_t i = 0; i < len; ++i) { p = b; // + 0; n = array_[p].base; if ((array_u_type_) b == array_[p].check && n < 0) { // result[num] = -n-1; if (num < result_len) set_result(result[num], -n-1, i); ++num; } p = b +(node_u_type_)(key[i]) + 1; if ((array_u_type_) b == array_[p].check) b = array_[p].base; else return num; } p = b; n = array_[p].base; if ((array_u_type_)b == array_[p].check && n < 0) { if (num < result_len) set_result(result[num], -n-1, len); ++num; } return num; } value_type traverse(const key_type *key, size_t &node_pos, size_t &key_pos, size_t len = 0) { if (!len) len = length_func_()(key); register array_type_ b = array_[node_pos].base; register array_u_type_ p; for (; key_pos < len; ++key_pos) { p = b +(node_u_type_)(key[key_pos]) + 1; if (static_cast(b) == array_[p].check) { node_pos = p; b = array_[p].base; } else { return -2; // no node } } p = b; array_type_ n = array_[p].base; if (static_cast(b) == array_[p].check && n < 0) return -n-1; return -1; // found, but no value } }; #if @DARTS_DEFAULT_ARRAY_SIZE@ == @DARTS_SIZEOF_SHORT@ typedef Darts::DoubleArrayImpl DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if @DARTS_DEFAULT_ARRAY_SIZE@ == @DARTS_SIZEOF_INT@ && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) typedef Darts::DoubleArrayImpl DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if @DARTS_DEFAULT_ARRAY_SIZE@ == @DARTS_SIZEOF_LONG@ && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) typedef Darts::DoubleArrayImpl DoubleArray; #define DARTS_ARRAY_SIZE_IS_DEFINED 1 #endif #if @DARTS_DEFAULT_ARRAY_SIZE@ == @DARTS_SIZEOF_LONG_LONG@ && !defined(DARTS_ARRAY_SIZE_IS_DEFINED) typedef Darts::DoubleArrayImpl DoubleArray; #endif } #endif