|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "wetext_string.h"
|
|
|
|
#include "wetext_log.h"
|
|
|
|
namespace wetext {
|
|
// Characters treated as whitespace by Ltrim() and Rtrim(): space, line feed,
// carriage return, horizontal tab, form feed and vertical tab.
const char* WHITESPACE = " \n\r\t\f\v";
|
|
|
|
// Returns how many bytes make up the UTF-8 sequence introduced by the lead
// byte `ch`, judged solely from the byte's high-order bits:
//   0xxxxxxx -> 1    110xxxxx -> 2    1110xxxx -> 3    11110xxx -> 4
// A byte matching none of these patterns (e.g. a 10xxxxxx continuation byte)
// is reported as a single byte.
int UTF8CharLength(char ch) {
  // The top five bits must not exceed 11110; what happens on violation is
  // defined by the CHECK_LE macro.
  CHECK_LE((ch & 0xF8), 0xF0);

  // US-ASCII: the first 128 code points fit in one byte.
  if ((ch & 0x80) == 0x00) {
    return 1;
  }
  // Two-byte sequences hold the next 1,920 code points, i.e. the rest of
  // nearly every Latin-script alphabet.
  if ((ch & 0xE0) == 0xC0) {
    return 2;
  }
  // Three-byte sequences cover the remainder of the Basic Multilingual Plane,
  // including most Chinese, Japanese and Korean characters.
  if ((ch & 0xF0) == 0xE0) {
    return 3;
  }
  // Four-byte sequences encode the other Unicode planes: rarer CJK
  // characters, historic scripts, mathematical symbols and emoji.
  if ((ch & 0xF8) == 0xF0) {
    return 4;
  }
  // No lead-byte pattern matched; fall back to one byte.
  return 1;
}
|
|
|
|
int UTF8StringLength(const std::string& str) {
|
|
int len = 0;
|
|
int num_bytes = 1;
|
|
for (size_t i = 0; i < str.length(); i += num_bytes) {
|
|
num_bytes = UTF8CharLength(str[i]);
|
|
++len;
|
|
}
|
|
return len;
|
|
}
|
|
|
|
void SplitUTF8StringToChars(const std::string& str,
|
|
std::vector<std::string>* chars) {
|
|
chars->clear();
|
|
int num_bytes = 1;
|
|
for (size_t i = 0; i < str.length(); i += num_bytes) {
|
|
num_bytes = UTF8CharLength(str[i]);
|
|
chars->push_back(str.substr(i, num_bytes));
|
|
}
|
|
}
|
|
|
|
std::string Ltrim(const std::string& str) {
|
|
size_t start = str.find_first_not_of(WHITESPACE);
|
|
return (start == std::string::npos) ? "" : str.substr(start);
|
|
}
|
|
|
|
std::string Rtrim(const std::string& str) {
|
|
size_t end = str.find_last_not_of(WHITESPACE);
|
|
return end == std::string::npos ? "" : str.substr(0, end + 1);
|
|
}
|
|
|
|
// Returns a copy of `str` with whitespace stripped from both ends: the
// leading run is removed first, then the trailing run.
std::string Trim(const std::string& str) {
  const std::string without_leading = Ltrim(str);
  return Rtrim(without_leading);
}
|
|
|
|
// Splits `str` at every non-overlapping occurrence of `delim` (scanning left
// to right) and appends the resulting pieces, in order, to `*output`.
//
// - `*output` is NOT cleared; tokens are appended after any existing entries.
// - Leading, trailing or adjacent delimiters produce empty tokens, so a call
//   always appends (number of matches + 1) strings. An empty `str` therefore
//   appends a single empty string.
// - An empty `delim` matches at every position and would never let the scan
//   advance; in that case `str` is appended unchanged as one token.
void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output) {
  if (delim.empty()) {
    // find("") always succeeds at the current offset, so looping on it would
    // never terminate. Treat the input as a single token instead.
    output->emplace_back(str);
    return;
  }

  // Walk the input with a moving start offset rather than copying the string
  // and erasing its front after every match.
  size_t token_begin = 0;
  size_t match = 0;
  while ((match = str.find(delim, token_begin)) != std::string::npos) {
    output->emplace_back(str.substr(token_begin, match - token_begin));
    token_begin = match + delim.length();
  }
  // Whatever follows the last delimiter (possibly nothing) is the final token.
  output->emplace_back(str.substr(token_begin));
}
|
|
|
|
} // namespace wetext
|