1. Bitmap 原理 1.1. 基本概念 Bitmap(位图)是一种使用二进制位(bit)来存储数据的高效数据结构。每个 bit 只能表示 0 或 1,可以用来表示某个元素是否存在。
1.2. 核心功能 与Set类似,天然具备去重功能,但存储效率很高,更省内存。
1.3. 核心思想
用一个 bit 数组来存储数据
每个 bit 的位置对应一个元素(如数字、ID等)
bit 值为 1 表示元素存在,0 表示不存在
1.4. 存储效率 相比传统的Set数据结构,Bitmap 可以极大地节省存储空间:
存储 100 万个32位整数:传统需要 4MB,Bitmap 只需要约 125KB,压缩率为 32 倍。
存储 1 亿个整数:传统需要 400MB,Bitmap 只需要约 12.5MB,压缩率为 32 倍。
2. Bitmap 代码实现 2.1. 基础版本 Bitmap类:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 #include <cmath> #include <cstdint> #include <stdexcept> #include <string> #include <vector> class Bitmap { private : std::vector<uint8_t > bits_; size_t size_; public : Bitmap (size_t size) : size_ (size) { bits_.resize (std::ceil (size / 8.0 ), 0 ); } ~Bitmap () = default ; void set (size_t position) { if (position >= size_) { throw std::out_of_range ("Position " + std::to_string (position) + " out of range" ); } size_t byteIndex = position >> 3 ; size_t bitIndex = position & 0x7 ; bits_[byteIndex] |= (1 << bitIndex); } bool get (size_t position) const { if (position >= size_) { throw std::out_of_range ("Position " + std::to_string (position) + " out of range" ); } size_t byteIndex = position >> 3 ; size_t bitIndex = position & 0x7 ; return (bits_[byteIndex] & (1 << bitIndex)) != 0 ; } bool operator [](size_t position) const { return get (position); } void reset (size_t position) { if (position >= size_) { throw std::out_of_range ("Position " + std::to_string (position) + " out of range" ); } size_t byteIndex = position >> 3 ; size_t bitIndex = position & 0x7 ; bits_[byteIndex] &= ~(1 << bitIndex); } size_t count () { size_t count = 0 ; for (size_t i = 0 ; i < bits_.size (); i++) { uint8_t byte = bits_[i] & 0xff ; for (size_t j = 0 ; j < 8 ; j++) { if ((byte & (1 << j)) != 0 ) { count++; } } } return count; } size_t size () const { return size_; } Bitmap operator &(const Bitmap& other) const { if (this ->size_ != other.size_) { throw std::invalid_argument ("Bitmaps must have same size" ); } Bitmap result (size_) ; for (size_t i = 0 ; i < bits_.size (); i++) { result.bits_[i] = bits_[i] & other.bits_[i]; } return result; } Bitmap operator |(const Bitmap& other) const { if (this ->size_ != other.size_) { throw std::invalid_argument ("Bitmaps must have same size" ); } Bitmap result (size_) ; for (size_t i = 0 ; i < bits_.size (); i++) { result.bits_[i] = bits_[i] | other.bits_[i]; } return result; } bool operator ==(const Bitmap& other) const { if (size_ != other.size_) return false ; for (size_t i = 0 ; i < bits_.size (); i++) { if (bits_[i] != other.bits_[i]) return false ; } return true ; } };
使用示例:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 #include <iostream> void test_bitmap () { std::cout << "=== 基础 Bitmap 示例 ===" << std::endl; Bitmap bitmap1 (100 ) ; bitmap1. set (10 ); bitmap1. set (20 ); bitmap1. set (99 ); std::cout << "位置 10: " << bitmap1. get (10 ) << std::endl; std::cout << "位置 10: " << bitmap1. get (10 ) << std::endl; std::cout << "位置 25: " << bitmap1[25 ] << std::endl; std::cout << "元素数量: " << bitmap1. count () << std::endl; bitmap1. set (99 ); std::cout << "元素数量: " << bitmap1. count () << std::endl; bitmap1. reset (99 ); std::cout << "元素数量: " << bitmap1. count () << std::endl; Bitmap bitmap2 (100 ) ; bitmap2. set (20 ); bitmap2. set (30 ); bitmap2. set (40 ); auto andResult = bitmap1 & bitmap2; std::cout << "bitmap1 & bitmap2 : " << andResult.count () << std::endl; auto orResult = bitmap1 | bitmap2; std::cout << "bitmap1 | bitmap2 : " << orResult.count () << std::endl; }
运行结果:
1 2 3 4 5 6 7 8 9 === 基础 Bitmap 示例 === 位置 10: 1 位置 10: 1 位置 25: 0 元素数量: 3 元素数量: 3 元素数量: 2 bitmap1 & bitmap2 : 1 bitmap1 | bitmap2 : 4
2.2. 动态扩容 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 #include <cmath> #include <cstdint> #include <stdexcept> #include <string> #include <vector> class DynamicBitmap { private : std::vector<uint8_t> bits_; size_t size_; public : DynamicBitmap(size_t init_size = 64 ) : size_(init_size) { bits_.resize(std::ceil(size_ / 8.0 ), 0 ); } ~DynamicBitmap() = default ; void set (size_t position) { ensureCapacity(position + 1 ); size_t byteIndex = position >> 3 ; size_t bitIndex = position & 0x7 ; bits_[byteIndex] |= (1 << bitIndex); } void ensureCapacity (size_t minSize) { if (minSize <= size_) return ; size_ = std::max(size_ * 2 , minSize); bits_.resize(size_); } };
2.3. 压缩位图 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 #include <cmath> #include <cstdint> #include <stdexcept> #include <string> #include <vector> class RoaringBitmap { private : size_t block_size_{ 0 }; std::unordered_map<size_t , Bitmap> container_; public : RoaringBitmap (size_t blockSize) : block_size_ (blockSize) { } ~RoaringBitmap () = default ; public : size_t block_size () const { return block_size_; } void set (size_t position) { size_t key = position / block_size_; size_t bitPosition = position % block_size_; if (!container_.count (key)) { container_.emplace (key, Bitmap (block_size_)); } container_.at (key).set (bitPosition); } bool get (size_t position) const { size_t key = position / block_size_; size_t bitPosition = position % block_size_; auto itr = container_.find (key); if (itr != container_.end ()) { return false ; } return itr->second.get (bitPosition); } bool operator [](size_t position) const { return get (position); } void reset (size_t position) { size_t key = position / block_size_; size_t bitPosition = position % block_size_; auto itr = container_.find (key); if (itr != container_.end ()) { throw std::out_of_range ("Position " + std::to_string (position) + " not in container" ); } itr->second.reset (position); } size_t count () { size_t count = 0 ; for (auto itr = container_.begin (); itr != container_.end (); itr++) { count += itr->second.count (); } return count; } size_t size () const { size_t size = 0 ; for (auto itr = container_.begin (); itr != container_.end (); itr++) { size += itr->second.size (); } return size; } RoaringBitmap operator &(const RoaringBitmap& other) const { if (block_size () != other.block_size ()) { throw std::invalid_argument ("RoaringBitmap must have same block_size" ); } if (container_.size () != other.container_.size ()) { throw std::invalid_argument ("RoaringBitmap must have same size" ); } RoaringBitmap rBitmap (block_size_) ; for (auto itr = container_.begin (); itr != container_.end (); itr++) { auto & key = itr->first; auto & val = itr->second; if (other.container_.count (key)) { Bitmap result = val & other.container_.at (key); rBitmap.container_.emplace (key, result); } } return rBitmap; } RoaringBitmap operator |(const RoaringBitmap& other) const { if (block_size () != other.block_size ()) { throw std::invalid_argument ("RoaringBitmap must have same block_size" ); } if (container_.size () != other.container_.size ()) { throw std::invalid_argument ("RoaringBitmap must have same size" ); } RoaringBitmap rBitmap (block_size_) ; for (auto itr = container_.begin (); itr != container_.end (); itr++) { auto & key = itr->first; auto & val = itr->second; if (other.container_.count (key)) { Bitmap result = val | other.container_.at (key); rBitmap.container_.emplace (key, result); } else { rBitmap.container_.emplace (key, val); } } for (auto itr = other.container_.begin (); itr != other.container_.end (); itr++) { auto & key = itr->first; auto & val = itr->second; if (!rBitmap.container_.count (key)) { rBitmap.container_.emplace (key, val); } } return rBitmap; } bool operator ==(const RoaringBitmap& other) const { if (block_size () != other.block_size ()) { return false ; } if (container_.size () != other.container_.size ()) { return false ; } for (auto itr = container_.begin (); itr != container_.end (); itr++) { auto & key = itr->first; auto & val = itr->second; if (!other.container_.count (key)) { return false ; } if (val != other.container_.at (key)) { return false ; } } return true ; } };
2.4. 对比分析 对比分析:
特性维度
基础 Bitmap
动态扩容 Bitmap
RoaringBitmap
内存分配
固定大小,初始化时确定
动态调整,按需扩容
分块存储,稀疏数据压缩
内存效率
数据密集时高效
数据密集时高效,但有扩容开销
稀疏数据时极高,密集数据接近基础版
存储密度
始终 O(n)
始终 O(n)
稀疏时 O(k),密集时 O(n)
访问性能
O(1) 直接寻址
O(1),扩容时可能短暂阻塞
O(1) 或 O(log k),k为容器数
集合操作
极快,位运算
极快,位运算
快速,容器级并行
适用数据分布
密集、连续数据
数据量不确定但分布密集
稀疏、不均匀数据
典型应用场景
用户ID状态管理、固定范围标记
动态用户系统、渐进式数据收集
大规模稀疏数据、数据仓库查询
选择建议:
数据密集且范围已知 → 基础 Bitmap
数据密集但范围未知 → 动态扩容 Bitmap
数据稀疏或分布不均 → RoaringBitmap
需要高效集合运算 → 三者均可,RoaringBitmap在大数据量时更优
3. Bitmap 应用场景 3.1. 4.1 大数据去重 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 #include <memory> class DeduplicationService {private : std::unique_ptr<Bitmap> bitmap_; public : DeduplicationService (size_t maxUserId) { bitmap_ = std::make_unique <Bitmap>(maxUserId); } bool isProcessed (size_t userId) { return bitmap_->get (userId); } void markProcessed (size_t userId) { bitmap_->set (userId); } size_t getProcessedCount () { return bitmap_->count (); } };
3.2. 4.2 用户标签系统 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 #include <unordered_map> #include <vector> #include <string> #include <memory> class UserTagSystem {private : std::unordered_map<std::string, std::unique_ptr<Bitmap>> tagBitmaps_; size_t totalUsers_; public : UserTagSystem (size_t totalUsers) : totalUsers_ (totalUsers) {} void addTag (size_t userId, const std::string& tag) { auto it = tagBitmaps_.find (tag); if (it == tagBitmaps_.end ()) { tagBitmaps_[tag] = std::make_unique <Bitmap>(totalUsers_); } tagBitmaps_[tag]->set (userId); } void removeTag (size_t userId, const std::string& tag) { auto it = tagBitmaps_.find (tag); if (it != tagBitmaps_.end ()) { it->second->reset (userId); } } std::vector<size_t > getUsersWithTag (const std::string& tag) { std::vector<size_t > users; auto it = tagBitmaps_.find (tag); if (it != tagBitmaps_.end ()) { Bitmap* bitmap = it->second.get (); for (size_t i = 0 ; i < totalUsers_; i++) { if (bitmap->get (i)) { users.push_back (i); } } } return users; } std::vector<size_t > getUsersWithTags (const std::vector<std::string>& tags) { if (tags.empty ()) return std::vector <size_t >(); auto firstIt = tagBitmaps_.find (tags[0 ]); if (firstIt == tagBitmaps_.end ()) { return std::vector <size_t >(); } std::unique_ptr<Bitmap> result = std::make_unique <Bitmap>(*(firstIt->second)); for (size_t i = 1 ; i < tags.size (); i++) { auto it = tagBitmaps_.find (tags[i]); if (it == tagBitmaps_.end ()) { return std::vector <size_t >(); } Bitmap intersection = *result & *(it->second); result = std::make_unique <Bitmap>(intersection); if (result->count () == 0 ) { return std::vector <size_t >(); } } std::vector<size_t > users; for (size_t i = 0 ; i < totalUsers_; i++) { if (result->get (i)) { users.push_back (i); } } return users; } std::vector<std::string> getUserTags (size_t userId) { std::vector<std::string> userTags; for (const auto & pair : tagBitmaps_) { if (pair.second->get (userId)) { userTags.push_back (pair.first); } } return userTags; } size_t getTagCount () const { return tagBitmaps_.size (); } bool hasTag (const std::string& tag) const { return tagBitmaps_.find (tag) != tagBitmaps_.end (); } };
3.3. 4.3 布隆过滤器(Bloom Filter) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 #include <unordered_map> #include <vector> #include <string> #include <memory> class UserTagSystem {private : std::unordered_map<std::string, std::unique_ptr<Bitmap>> tagBitmaps_; size_t totalUsers_; public : UserTagSystem (size_t totalUsers) : totalUsers_ (totalUsers) {} void addTag (size_t userId, const std::string& tag) { auto it = tagBitmaps_.find (tag); if (it == tagBitmaps_.end ()) { tagBitmaps_[tag] = std::make_unique <Bitmap>(totalUsers_); } tagBitmaps_[tag]->set (userId); } void removeTag (size_t userId, const std::string& tag) { auto it = tagBitmaps_.find (tag); if (it != tagBitmaps_.end ()) { it->second->reset (userId); } } std::vector<size_t > getUsersWithTag (const std::string& tag) { std::vector<size_t > users; auto it = tagBitmaps_.find (tag); if (it != tagBitmaps_.end ()) { Bitmap* bitmap = it->second.get (); for (size_t i = 0 ; i < totalUsers_; i++) { if (bitmap->get (i)) { users.push_back (i); } } } return users; } std::vector<size_t > getUsersWithTags (const std::vector<std::string>& tags) { if (tags.empty ()) return std::vector <size_t >(); auto firstIt = tagBitmaps_.find (tags[0 ]); if (firstIt == tagBitmaps_.end ()) { return std::vector <size_t >(); } std::unique_ptr<Bitmap> result = std::make_unique <Bitmap>(*(firstIt->second)); for (size_t i = 1 ; i < tags.size (); i++) { auto it = tagBitmaps_.find (tags[i]); if (it == tagBitmaps_.end ()) { return std::vector <size_t >(); } Bitmap intersection = *result & *(it->second); result = std::make_unique <Bitmap>(intersection); if (result->count () == 0 ) { return std::vector <size_t >(); } } std::vector<size_t > users; for (size_t i = 0 ; i < totalUsers_; i++) { if (result->get (i)) { users.push_back (i); } } return users; } std::vector<std::string> getUserTags (size_t userId) { std::vector<std::string> userTags; for (const auto & pair : tagBitmaps_) { if (pair.second->get (userId)) { userTags.push_back (pair.first); } } return userTags; } size_t getTagCount () const { return tagBitmaps_.size (); } bool hasTag (const std::string& tag) const { return tagBitmaps_.find (tag) != tagBitmaps_.end (); } };
4. 总结 Bitmap 是一种极其高效的数据结构,特别适合以下场景:
海量数据去重 :用户ID、订单号等去重
标签系统 :用户画像、商品标签等
集合运算 :交集、并集、差集计算
布隆过滤器 :快速判断元素是否存在
数据库索引 :加速查询操作
优点 :
缺点 :
一般只支持整数类型
数据稀疏时浪费空间
不支持删除操作(需要特殊处理)
数据范围较大时需要较多内存
在实际应用中,可以根据具体需求选择合适的 Bitmap 变种,如 Roaring Bitmap 用于处理稀疏数据,压缩位图用于优化存储空间等。