Bitmap算法

1. Bitmap 原理
2. Bitmap 代码实现
3. Bitmap 应用场景
4. 总结

1. Bitmap 原理

1.1. 基本概念

Bitmap（位图）是一种使用二进制位（bit）来存储数据的高效数据结构。每个 bit 只能表示 0 或 1，可以用来表示某个元素是否存在。

1.2. 核心功能

与Set类似，天然具备去重功能，但存储效率很高，更省内存。

1.3. 核心思想

用一个 bit 数组来存储数据
每个 bit 的位置对应一个元素（如数字、ID等）
bit 值为 1 表示元素存在，0 表示不存在

1.4. 存储效率

相比传统的Set数据结构，Bitmap 可以极大地节省存储空间：

存储 100 万个32位整数：传统需要 4MB，Bitmap 只需要约 125KB，压缩率为 32 倍。
存储 1 亿个整数：传统需要 400MB，Bitmap 只需要约 12.5MB，压缩率为 32 倍。

2. Bitmap 代码实现

2.1. 基础版本

Bitmap类：

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

class Bitmap
{
private:
    std::vector<uint8_t> bits_;
    size_t size_;

public:
    Bitmap(size_t size)
      : size_(size)
    {
        // 计算需要的字节数，向上取整: size/8
        // 注意：这里必须除以8.0，才能获取浮点数然后向上取整
        bits_.resize(std::ceil(size / 8.0), 0);
    }

    ~Bitmap() = default;

    /**
     * 设置指定位置为 1
     */
    void set(size_t position)
    {
        if (position >= size_)
        {
            throw std::out_of_range("Position " + std::to_string(position) + " out of range");
        }

        size_t byteIndex = position >> 3; // 等价于 position / 8
        size_t bitIndex = position & 0x7; // 等价于 position % 8
        bits_[byteIndex] |= (1 << bitIndex);
    }

    /**
     * 获取指定位置的值
     */
    bool get(size_t position) const
    {
        if (position >= size_)
        {
            throw std::out_of_range("Position " + std::to_string(position) + " out of range");
        }
        size_t byteIndex = position >> 3; // 等价于 position / 8
        size_t bitIndex = position & 0x7; // 等价于 position % 8
        return (bits_[byteIndex] & (1 << bitIndex)) != 0;
    }

    /**
     * 重载 [] 操作符
     */
    bool operator[](size_t position) const { return get(position); }

    /**
     * 设置指定位置为 0
     */
    void reset(size_t position)
    {
        if (position >= size_)
        {
            throw std::out_of_range("Position " + std::to_string(position) + " out of range");
        }
        size_t byteIndex = position >> 3;     // 等价于 position / 8
        size_t bitIndex = position & 0x7;     // 等价于 position % 8
        bits_[byteIndex] &= ~(1 << bitIndex); // 对应的bit设置为0
    }

    /**
     * @brief 获取数值为1的位数
     *
     * @return size_t
     */
    size_t count()
    {
        size_t count = 0;
        for (size_t i = 0; i < bits_.size(); i++)
        {
            uint8_t byte = bits_[i] & 0xff;
            for (size_t j = 0; j < 8; j++)
            {
                // 方法一
                if ((byte & (1 << j)) != 0)
                {
                    count++;
                }
                // 方法二
                // if (((byte >>> j) & 0x01) == 1)
                // {
                //     count ++;
                // }
            }
        }
        return count;
    }

    /**
     * 获取 Bitmap 的大小（bit 数）
     */
    size_t size() const { return size_; }

    /**
     * 与另一个 Bitmap 进行 AND 操作
     */
    Bitmap operator&(const Bitmap& other) const
    {
        if (this->size_ != other.size_)
        {
            throw std::invalid_argument("Bitmaps must have same size");
        }
        Bitmap result(size_);
        for (size_t i = 0; i < bits_.size(); i++)
        {
            result.bits_[i] = bits_[i] & other.bits_[i];
        }
        return result;
    }

    /**
     * 与另一个 Bitmap 进行 OR 操作
     */
    Bitmap operator|(const Bitmap& other) const
    {
        if (this->size_ != other.size_)
        {
            throw std::invalid_argument("Bitmaps must have same size");
        }
        Bitmap result(size_);
        for (size_t i = 0; i < bits_.size(); i++)
        {
            result.bits_[i] = bits_[i] | other.bits_[i];
        }
        return result;
    }

    /**
     * 判断是否相等
     */
    bool operator==(const Bitmap& other) const
    {
        if (size_ != other.size_)
            return false;
        for (size_t i = 0; i < bits_.size(); i++)
        {
            if (bits_[i] != other.bits_[i])
                return false;
        }
        return true;
    }
};

使用示例：

#include <iostream>

void test_bitmap()
{
    std::cout << "=== 基础 Bitmap 示例 ===" << std::endl;
    Bitmap bitmap1(100);
    bitmap1.set(10);
    bitmap1.set(20);
    bitmap1.set(99);

    std::cout << "位置 10: " << bitmap1.get(10) << std::endl;
    std::cout << "位置 10: " << bitmap1.get(10) << std::endl;
    std::cout << "位置 25: " << bitmap1[25] << std::endl;
    std::cout << "元素数量: " << bitmap1.count() << std::endl;
    bitmap1.set(99); // 添加重复元素，数量保持不变
    std::cout << "元素数量: " << bitmap1.count() << std::endl;
    bitmap1.reset(99); // 重置位置20的只为0，数量减1
    std::cout << "元素数量: " << bitmap1.count() << std::endl;

    // 逻辑位运算
    Bitmap bitmap2(100);
    bitmap2.set(20);
    bitmap2.set(30);
    bitmap2.set(40);
    auto andResult = bitmap1 & bitmap2;
    std::cout << "bitmap1 & bitmap2 : " << andResult.count() << std::endl;
    auto orResult = bitmap1 | bitmap2;
    std::cout << "bitmap1 | bitmap2 : " << orResult.count() << std::endl;
}

运行结果：

=== 基础 Bitmap 示例 ===
位置 10: 1
位置 10: 1
位置 25: 0
元素数量: 3
元素数量: 3
元素数量: 2
bitmap1 & bitmap2 : 1
bitmap1 | bitmap2 : 4

2.2. 动态扩容

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

class DynamicBitmap
{
private:
    std::vector<uint8_t> bits_;
    size_t size_;

public:
    DynamicBitmap(size_t init_size = 64)
      : size_(init_size)
    {
        // 计算需要的字节数，向上取整: size/8
        // 注意：这里必须除以8.0，才能获取浮点数然后向上取整
        bits_.resize(std::ceil(size_ / 8.0), 0);
    }

    ~DynamicBitmap() = default;

    /**
     * 设置指定位置为 1
     */
    void set(size_t position)
    {
        ensureCapacity(position + 1);

        size_t byteIndex = position >> 3; // 等价于 position / 8
        size_t bitIndex = position & 0x7; // 等价于 position % 8
        bits_[byteIndex] |= (1 << bitIndex);
    }

    /**
     * 动态扩容
     */
    void ensureCapacity(size_t minSize)
    {
        if (minSize <= size_)
            return;

        size_ = std::max(size_ * 2, minSize);
        bits_.resize(size_);
    }
    
    // 其他方法与Bitmap类似...
};

2.3. 压缩位图

#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

class RoaringBitmap
{
private:
    size_t block_size_{ 0 };
    std::unordered_map<size_t, Bitmap> container_;

public:
    RoaringBitmap(size_t blockSize)
      : block_size_(blockSize)
    {
    }

    ~RoaringBitmap() = default;

public:
    size_t block_size() const { return block_size_; }

    /**
     * 设置指定位置为 1
     */
    void set(size_t position)
    {
        size_t key = position / block_size_;
        size_t bitPosition = position % block_size_;

        if (!container_.count(key))
        {
            // block不存在则添加新的block
            container_.emplace(key, Bitmap(block_size_));
        }
        container_.at(key).set(bitPosition);
    }

    /**
     * 获取指定位置的值
     */
    bool get(size_t position) const
    {
        size_t key = position / block_size_;
        size_t bitPosition = position % block_size_;

        auto itr = container_.find(key);
        if (itr != container_.end())
        {
            return false;
        }
        return itr->second.get(bitPosition);
    }

    /**
     * 重载 [] 操作符
     */
    bool operator[](size_t position) const { return get(position); }

    /**
     * 设置指定位置为 0
     */
    void reset(size_t position)
    {
        size_t key = position / block_size_;
        size_t bitPosition = position % block_size_;

        auto itr = container_.find(key);
        if (itr != container_.end())
        {
            throw std::out_of_range("Position " + std::to_string(position) + " not in container");
        }

        itr->second.reset(position);
    }

    /**
     * @brief 获取数值为1的位数
     *
     * @return size_t
     */
    size_t count()
    {
        size_t count = 0;
        for (auto itr = container_.begin(); itr != container_.end(); itr++)
        {
            count += itr->second.count();
        }
        return count;
    }

    /**
     * 获取 Bitmap 的大小（bit 数）
     */
    size_t size() const
    {
        size_t size = 0;
        for (auto itr = container_.begin(); itr != container_.end(); itr++)
        {
            size += itr->second.size();
        }
        return size;
    }

    /**
     * 与另一个 Bitmap 进行 AND 操作
     */
    RoaringBitmap operator&(const RoaringBitmap& other) const
    {
        if (block_size() != other.block_size())
        {
            throw std::invalid_argument("RoaringBitmap must have same block_size");
        }
        if (container_.size() != other.container_.size())
        {
            throw std::invalid_argument("RoaringBitmap must have same size");
        }
        RoaringBitmap rBitmap(block_size_);
        for (auto itr = container_.begin(); itr != container_.end(); itr++)
        {
            auto& key = itr->first;
            auto& val = itr->second;

            if (other.container_.count(key))
            {
                Bitmap result = val & other.container_.at(key);
                rBitmap.container_.emplace(key, result);
            }
        }

        return rBitmap;
    }

    /**
     * 与另一个 Bitmap 进行 OR 操作
     */
    RoaringBitmap operator|(const RoaringBitmap& other) const
    {
        if (block_size() != other.block_size())
        {
            throw std::invalid_argument("RoaringBitmap must have same block_size");
        }
        if (container_.size() != other.container_.size())
        {
            throw std::invalid_argument("RoaringBitmap must have same size");
        }
        RoaringBitmap rBitmap(block_size_);
        for (auto itr = container_.begin(); itr != container_.end(); itr++)
        {
            auto& key = itr->first;
            auto& val = itr->second;

            if (other.container_.count(key))
            {
                Bitmap result = val | other.container_.at(key);
                rBitmap.container_.emplace(key, result);
            }
            else
            {
                rBitmap.container_.emplace(key, val);
            }
        }

        for (auto itr = other.container_.begin(); itr != other.container_.end(); itr++)
        {
            auto& key = itr->first;
            auto& val = itr->second;
            if (!rBitmap.container_.count(key))
            {
                rBitmap.container_.emplace(key, val);
            }
        }

        return rBitmap;
    }

    /**
     * 判断是否相等
     */
    bool operator==(const RoaringBitmap& other) const
    {
        if (block_size() != other.block_size())
        {
            return false;
        }
        if (container_.size() != other.container_.size())
        {
            return false;
        }
        for (auto itr = container_.begin(); itr != container_.end(); itr++)
        {
            auto& key = itr->first;
            auto& val = itr->second;
            if (!other.container_.count(key))
            {
                return false;
            }
            if (val != other.container_.at(key))
            {
                return false;
            }
        }
        return true;
    }
};

2.4. 对比分析

对比分析：

特性维度	基础 Bitmap	动态扩容 Bitmap	RoaringBitmap
内存分配	固定大小，初始化时确定	动态调整，按需扩容	分块存储，稀疏数据压缩
内存效率	数据密集时高效	数据密集时高效，但有扩容开销	稀疏数据时极高，密集数据接近基础版
存储密度	始终 O(n)	始终 O(n)	稀疏时 O(k)，密集时 O(n)
访问性能	O(1) 直接寻址	O(1)，扩容时可能短暂阻塞	O(1) 或 O(log k)，k为容器数
集合操作	极快，位运算	极快，位运算	快速，容器级并行
适用数据分布	密集、连续数据	数据量不确定但分布密集	稀疏、不均匀数据
典型应用场景	用户ID状态管理、固定范围标记	动态用户系统、渐进式数据收集	大规模稀疏数据、数据仓库查询

选择建议：

数据密集且范围已知 → 基础 Bitmap
数据密集但范围未知 → 动态扩容 Bitmap
数据稀疏或分布不均 → RoaringBitmap
需要高效集合运算 → 三者均可，RoaringBitmap在大数据量时更优

3. Bitmap 应用场景

3.1. 4.1 大数据去重

#include <memory>

class DeduplicationService {
private:
    std::unique_ptr<Bitmap> bitmap_;
    
public:
    DeduplicationService(size_t maxUserId) {
        bitmap_ = std::make_unique<Bitmap>(maxUserId);
    }
    
    /**
     * 判断用户是否已经处理过
     */
    bool isProcessed(size_t userId) {
        return bitmap_->get(userId);
    }
    
    /**
     * 标记用户已处理
     */
    void markProcessed(size_t userId) {
        bitmap_->set(userId);
    }
    
    /**
     * 获取已处理的用户数量
     */
    size_t getProcessedCount() {
        return bitmap_->count();
    }
};

3.2. 4.2 用户标签系统

#include <unordered_map>
#include <vector>
#include <string>
#include <memory>

class UserTagSystem {
private:
    std::unordered_map<std::string, std::unique_ptr<Bitmap>> tagBitmaps_;
    size_t totalUsers_;
    
public:
    UserTagSystem(size_t totalUsers) : totalUsers_(totalUsers) {}
    
    /**
     * 为用户添加标签
     */
    void addTag(size_t userId, const std::string& tag) {
        auto it = tagBitmaps_.find(tag);
        if (it == tagBitmaps_.end()) {
            // 标签不存在，创建新的 Bitmap
            tagBitmaps_[tag] = std::make_unique<Bitmap>(totalUsers_);
        }
        tagBitmaps_[tag]->set(userId);
    }
    
    /**
     * 移除用户的标签
     */
    void removeTag(size_t userId, const std::string& tag) {
        auto it = tagBitmaps_.find(tag);
        if (it != tagBitmaps_.end()) {
            it->second->reset(userId);
        }
    }
    
    /**
     * 获取具有某个标签的所有用户
     */
    std::vector<size_t> getUsersWithTag(const std::string& tag) {
        std::vector<size_t> users;
        auto it = tagBitmaps_.find(tag);
        if (it != tagBitmaps_.end()) {
            Bitmap* bitmap = it->second.get();
            for (size_t i = 0; i < totalUsers_; i++) {
                if (bitmap->get(i)) {
                    users.push_back(i);
                }
            }
        }
        return users;
    }
    
    /**
     * 获取同时具有多个标签的用户
     */
    std::vector<size_t> getUsersWithTags(const std::vector<std::string>& tags) {
        if (tags.empty()) return std::vector<size_t>();
        
        // 找到第一个存在的标签 Bitmap
        auto firstIt = tagBitmaps_.find(tags[0]);
        if (firstIt == tagBitmaps_.end()) {
            return std::vector<size_t>();
        }
        
        std::unique_ptr<Bitmap> result = std::make_unique<Bitmap>(*(firstIt->second));
        
        // 计算所有标签的交集
        for (size_t i = 1; i < tags.size(); i++) {
            auto it = tagBitmaps_.find(tags[i]);
            if (it == tagBitmaps_.end()) {
                return std::vector<size_t>(); // 标签不存在，没有交集
            }
            
            Bitmap intersection = *result & *(it->second);
            result = std::make_unique<Bitmap>(intersection);
            
            // 如果交集为空，提前返回
            if (result->count() == 0) {
                return std::vector<size_t>();
            }
        }
        
        // 收集结果用户ID
        std::vector<size_t> users;
        for (size_t i = 0; i < totalUsers_; i++) {
            if (result->get(i)) {
                users.push_back(i);
            }
        }
        return users;
    }
    
    /**
     * 获取用户的所有标签
     */
    std::vector<std::string> getUserTags(size_t userId) {
        std::vector<std::string> userTags;
        for (const auto& pair : tagBitmaps_) {
            if (pair.second->get(userId)) {
                userTags.push_back(pair.first);
            }
        }
        return userTags;
    }
    
    /**
     * 获取标签数量
     */
    size_t getTagCount() const {
        return tagBitmaps_.size();
    }
    
    /**
     * 检查标签是否存在
     */
    bool hasTag(const std::string& tag) const {
        return tagBitmaps_.find(tag) != tagBitmaps_.end();
    }
};

3.3. 4.3 布隆过滤器（Bloom Filter）

#include <unordered_map>
#include <vector>
#include <string>
#include <memory>

class UserTagSystem {
private:
    std::unordered_map<std::string, std::unique_ptr<Bitmap>> tagBitmaps_;
    size_t totalUsers_;
    
public:
    UserTagSystem(size_t totalUsers) : totalUsers_(totalUsers) {}
    
    /**
     * 为用户添加标签
     */
    void addTag(size_t userId, const std::string& tag) {
        auto it = tagBitmaps_.find(tag);
        if (it == tagBitmaps_.end()) {
            // 标签不存在，创建新的 Bitmap
            tagBitmaps_[tag] = std::make_unique<Bitmap>(totalUsers_);
        }
        tagBitmaps_[tag]->set(userId);
    }
    
    /**
     * 移除用户的标签
     */
    void removeTag(size_t userId, const std::string& tag) {
        auto it = tagBitmaps_.find(tag);
        if (it != tagBitmaps_.end()) {
            it->second->reset(userId);
        }
    }
    
    /**
     * 获取具有某个标签的所有用户
     */
    std::vector<size_t> getUsersWithTag(const std::string& tag) {
        std::vector<size_t> users;
        auto it = tagBitmaps_.find(tag);
        if (it != tagBitmaps_.end()) {
            Bitmap* bitmap = it->second.get();
            for (size_t i = 0; i < totalUsers_; i++) {
                if (bitmap->get(i)) {
                    users.push_back(i);
                }
            }
        }
        return users;
    }
    
    /**
     * 获取同时具有多个标签的用户
     */
    std::vector<size_t> getUsersWithTags(const std::vector<std::string>& tags) {
        if (tags.empty()) return std::vector<size_t>();
        
        // 找到第一个存在的标签 Bitmap
        auto firstIt = tagBitmaps_.find(tags[0]);
        if (firstIt == tagBitmaps_.end()) {
            return std::vector<size_t>();
        }
        
        std::unique_ptr<Bitmap> result = std::make_unique<Bitmap>(*(firstIt->second));
        
        // 计算所有标签的交集
        for (size_t i = 1; i < tags.size(); i++) {
            auto it = tagBitmaps_.find(tags[i]);
            if (it == tagBitmaps_.end()) {
                return std::vector<size_t>(); // 标签不存在，没有交集
            }
            
            Bitmap intersection = *result & *(it->second);
            result = std::make_unique<Bitmap>(intersection);
            
            // 如果交集为空，提前返回
            if (result->count() == 0) {
                return std::vector<size_t>();
            }
        }
        
        // 收集结果用户ID
        std::vector<size_t> users;
        for (size_t i = 0; i < totalUsers_; i++) {
            if (result->get(i)) {
                users.push_back(i);
            }
        }
        return users;
    }
    
    /**
     * 获取用户的所有标签
     */
    std::vector<std::string> getUserTags(size_t userId) {
        std::vector<std::string> userTags;
        for (const auto& pair : tagBitmaps_) {
            if (pair.second->get(userId)) {
                userTags.push_back(pair.first);
            }
        }
        return userTags;
    }
    
    /**
     * 获取标签数量
     */
    size_t getTagCount() const {
        return tagBitmaps_.size();
    }
    
    /**
     * 检查标签是否存在
     */
    bool hasTag(const std::string& tag) const {
        return tagBitmaps_.find(tag) != tagBitmaps_.end();
    }
};