@@ -104,11 +104,52 @@ bool mllm::BasicTokenizer::is_chinese_char(wchar_t cp) {
            (cp >= 0x20000 && cp <= 0x2A6DF);
 }
 
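+// Split `text` into pieces so that every occurrence of a word from `words`
+// (the never-split set, e.g. special tokens) becomes its own piece, with the
+// surrounding text returned as separate pieces in order.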
+std::vector<std::wstring> splitBySet(const std::wstring& text, const std::unordered_set<std::wstring>& words) {
+    std::vector<std::wstring> result;
+    size_t pos = 0;
+
+    while (pos < text.length()) {
+        size_t minPos = std::wstring::npos;
+        std::wstring foundWord;
+
+        // Find the earliest match of any word at or after `pos`
+        for (const auto& word : words) {
+            size_t found = text.find(word, pos);
+            if (found != std::wstring::npos && found < minPos) {
+                minPos = found;
+                foundWord = word;
+            }
+        }
+
+        // If a match was found, emit the text before it and then the match itself
+        if (minPos != std::wstring::npos) {
+            if (minPos > pos) {
+                // Text preceding the match
+                result.push_back(text.substr(pos, minPos - pos));
+            }
+            // The matched word
+            result.push_back(foundWord);
+            pos = minPos + foundWord.size();
+        } else {
+            // No more matches; emit the remaining text
+            result.push_back(text.substr(pos));
+            break;
+        }
+    }
+
+    return result;
+}
+
 std::vector<std::wstring> mllm::BasicTokenizer::tokenize(const std::wstring& text) {
     std::wstring cleaned = clean_text(text);
     if (_tokenize_chinese_chars)
         cleaned = tokenize_chinese_chars(cleaned);
-    std::vector<std::wstring> split_tokens = whitespace_tokenize(cleaned);
+    std::vector<std::wstring> whitespace_split_tokens = whitespace_tokenize(cleaned);
+    std::vector<std::wstring> split_tokens;
+    // Further split each whitespace token so that never-split tokens stay intact
+    for (const auto& token : whitespace_split_tokens) {
+        auto sub_tokens = splitBySet(token, never_split);
+        split_tokens.insert(split_tokens.end(), sub_tokens.begin(), sub_tokens.end());
+    }
     std::vector<std::wstring> output_tokens;
 
     for (auto& token : split_tokens) {
@@ -138,6 +179,9 @@ std::vector<std::wstring> mllm::BasicTokenizer::tokenize(const std::wstring& tex
 
     return output_tokens;
 }
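+// Register a token that the basic tokenizer must never split apart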
+void mllm::BasicTokenizer::add_never_split(const std::wstring& token) {
+    never_split.insert(token);
+}
 
 void mllm::WordPieceTokenizer::tokenize(const string &text, vector<token_id_t> &tokens, bool bos) {
     auto wstr = utf8_to_wstring(text);
@@ -167,6 +211,7 @@ void mllm::WordPieceTokenizer::tokenize(const string &text, vector<token_id_t> &
             break;
         } else {
             token_strs.push_back(str);
+            // printf("word: %s\n", str.c_str());
         }
         start = end;
     }
@@ -177,3 +222,9 @@ void mllm::WordPieceTokenizer::tokenize(const string &text, vector<token_id_t> &
         tokens.push_back(vocab_map_[token_str]);
     }
 }
+void mllm::WordPieceTokenizer::add_special_tokens(const vector<std::string>& special_tokens) {
+    // Add each special token to the basic tokenizer's never-split set
+    for (const auto& token : special_tokens) {
+        basic_tokenizer.add_never_split(utf8_to_wstring(token));
+    }
+}