@@ -104,11 +104,52 @@ bool mllm::BasicTokenizer::is_chinese_char(wchar_t cp) {
            (cp >= 0x20000 && cp <= 0x2A6DF);
 }
 
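+// Split `text` into pieces so that every occurrence of a word from `words`
+// (the never-split set, e.g. special tokens) becomes its own piece, with the
+// surrounding text returned as separate pieces in order.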
+std::vector<std::wstring> splitBySet(const std::wstring& text, const std::unordered_set<std::wstring>& words) {
+    std::vector<std::wstring> result;
+    size_t pos = 0;
+
+    while (pos < text.length()) {
+        size_t minPos = std::wstring::npos;
+        std::wstring foundWord;
+
+        // Find the earliest match of any word at or after `pos`
+        for (const auto& word : words) {
+            size_t found = text.find(word, pos);
+            if (found != std::wstring::npos && found < minPos) {
+                minPos = found;
+                foundWord = word;
+            }
+        }
+
+        // If a match was found, emit the text before it and then the match itself
+        if (minPos != std::wstring::npos) {
+            if (minPos > pos) {
+                // Text preceding the match
+                result.push_back(text.substr(pos, minPos - pos));
+            }
+            // The matched word
+            result.push_back(foundWord);
+            pos = minPos + foundWord.size();
+        } else {
+            // No more matches; emit the remaining text
+            result.push_back(text.substr(pos));
+            break;
+        }
+    }
+
+    return result;
+}
+
 std::vector<std::wstring> mllm::BasicTokenizer::tokenize(const std::wstring& text) {
     std::wstring cleaned = clean_text(text);
     if (_tokenize_chinese_chars)
         cleaned = tokenize_chinese_chars(cleaned);
-    std::vector<std::wstring> split_tokens = whitespace_tokenize(cleaned);
+    std::vector<std::wstring> whitespace_split_tokens = whitespace_tokenize(cleaned);
+    std::vector<std::wstring> split_tokens;
+    // Further split each whitespace token so that never-split tokens stay intact
+    for (const auto& token : whitespace_split_tokens) {
+        auto sub_tokens = splitBySet(token, never_split);
+        split_tokens.insert(split_tokens.end(), sub_tokens.begin(), sub_tokens.end());
+    }
     std::vector<std::wstring> output_tokens;
 
     for (auto& token : split_tokens) {
@@ -138,6 +179,9 @@ std::vector<std::wstring> mllm::BasicTokenizer::tokenize(const std::wstring& tex
 
     return output_tokens;
 }
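+// Register a token that the basic tokenizer must never split apart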
+void mllm::BasicTokenizer::add_never_split(const std::wstring& token) {
+    never_split.insert(token);
+}
 
 void mllm::WordPieceTokenizer::tokenize(const string &text, vector<token_id_t> &tokens, bool bos) {
     auto wstr = utf8_to_wstring(text);
@@ -167,6 +211,7 @@ void mllm::WordPieceTokenizer::tokenize(const string &text, vector<token_id_t> &
             break;
         } else {
             token_strs.push_back(str);
+            // printf("word: %s\n", str.c_str());
         }
         start = end;
     }
@@ -177,3 +222,9 @@ void mllm::WordPieceTokenizer::tokenize(const string &text, vector<token_id_t> &
         tokens.push_back(vocab_map_[token_str]);
     }
 }
+void mllm::WordPieceTokenizer::add_special_tokens(const vector<std::string>& special_tokens) {
+    // Add each special token to the basic tokenizer's never-split set
+    for (const auto& token : special_tokens) {
+        basic_tokenizer.add_never_split(utf8_to_wstring(token));
+    }
+}