|
|
// Copyright (c) 2021 Mobvoi Inc (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "wn_string.h"
#include <sstream>
#include <string>
#include <vector>
#include "log.h"
#include "wn_utils.h"
namespace wenet {
// Tokenizes `str` on spaces and tabs.
//
// The input is first passed through Trim(); the trimmed text is then split
// with SplitStringToVector() using " \t" as the delimiter set, with empty
// fields omitted. The tokens are stored in `*strs`.
void SplitString(const std::string& str, std::vector<std::string>* strs) {
  const std::string trimmed = Trim(str);
  const bool omit_empty_strings = true;
  SplitStringToVector(trimmed, " \t", omit_empty_strings, strs);
}
// Splits `full` at every character contained in `delim`.
//
// `*out` is cleared first and then receives the pieces in order. When
// `omit_empty_strings` is true, zero-length pieces (produced by adjacent
// delimiters, or by a delimiter at the very start or end) are skipped;
// otherwise they are kept, so an empty input yields one empty piece.
void SplitStringToVector(const std::string& full, const char* delim,
                         bool omit_empty_strings,
                         std::vector<std::string>* out) {
  out->clear();
  const size_t total = full.size();
  size_t begin = 0;
  for (;;) {
    const size_t hit = full.find_first_of(delim, begin);
    // A piece is empty when the delimiter sits right at `begin`, or when
    // `begin` already points past the last character (delimiter at the end).
    const bool piece_is_empty = (hit == begin) || (begin == total);
    if (!(omit_empty_strings && piece_is_empty)) {
      out->push_back(full.substr(begin, hit - begin));
    }
    if (hit == std::string::npos) {
      break;
    }
    begin = hit + 1;
  }
}
// Splits the UTF-8 encoded `str` into one string per encoded character.
//
// `*chars` is cleared first. The length of each character is derived from
// its lead byte. A byte that is not a valid lead byte (for example a stray
// continuation byte) is emitted as a one-byte element so that decoding
// resynchronizes on the next byte instead of reusing the previous
// character's length. A sequence truncated by the end of `str` yields the
// bytes that are available.
void SplitUTF8StringToChars(const std::string& str,
                            std::vector<std::string>* chars) {
  chars->clear();
  size_t bytes = 1;
  for (size_t i = 0; i < str.length(); i += bytes) {
    // Lead bytes 0xF8..0xFF never occur in UTF-8.
    assert((str[i] & 0xF8) <= 0xF0);
    if ((str[i] & 0x80) == 0x00) {
      // The first 128 characters (US-ASCII) in UTF-8 format only need one
      // byte.
      bytes = 1;
    } else if ((str[i] & 0xE0) == 0xC0) {
      // The next 1,920 characters need two bytes to encode,
      // which covers the remainder of almost all Latin-script alphabets.
      bytes = 2;
    } else if ((str[i] & 0xF0) == 0xE0) {
      // Three bytes are needed for characters in the rest of
      // the Basic Multilingual Plane, which contains virtually all characters
      // in common use, including most Chinese, Japanese and Korean characters.
      bytes = 3;
    } else if ((str[i] & 0xF8) == 0xF0) {
      // Four bytes are needed for characters in the other planes of Unicode,
      // which include less common CJK characters, various historic scripts,
      // mathematical symbols, and emoji (pictographic symbols).
      bytes = 4;
    } else {
      // Not a lead byte: consume exactly this byte rather than carrying over
      // the length of the previous character.
      bytes = 1;
    }
    chars->push_back(str.substr(i, bytes));
  }
}
// Returns the number of UTF-8 encoded characters in `str`.
//
// The width of each character is taken from its lead byte. A byte that is
// not a valid lead byte (for example a stray continuation byte) is counted
// as a single one-byte character, so it cannot cause following bytes to be
// skipped using the previous character's width.
int UTF8StringLength(const std::string& str) {
  int len = 0;
  size_t bytes = 1;
  for (size_t i = 0; i < str.length(); i += bytes) {
    if ((str[i] & 0x80) == 0x00) {
      bytes = 1;  // US-ASCII
    } else if ((str[i] & 0xE0) == 0xC0) {
      bytes = 2;
    } else if ((str[i] & 0xF0) == 0xE0) {
      bytes = 3;
    } else if ((str[i] & 0xF8) == 0xF0) {
      bytes = 4;
    } else {
      // Not a lead byte: advance by one byte only.
      bytes = 1;
    }
    ++len;
  }
  return len;
}
// Returns true if `ch` is a single byte that is either alphabetic (per
// isalpha) or an apostrophe.
//
// All English characters are encoded in one byte, so any string whose size
// is not exactly 1 is rejected. The apostrophe is accepted because English
// words may contain it, e.g. "He's".
bool CheckEnglishChar(const std::string& ch) {
  if (ch.size() != 1) {
    return false;
  }
  // isalpha() requires a value representable as unsigned char (or EOF);
  // passing a negative plain char is undefined behaviour.
  const unsigned char byte = static_cast<unsigned char>(ch[0]);
  return isalpha(byte) || byte == '\'';
}
// Returns true when every UTF-8 character of `word` passes
// CheckEnglishChar(). An empty `word` yields true because there is no
// character to reject.
bool CheckEnglishWord(const std::string& word) {
  std::vector<std::string> pieces;
  SplitUTF8StringToChars(word, &pieces);
  for (const std::string& piece : pieces) {
    if (!CheckEnglishChar(piece)) {
      return false;
    }
  }
  return true;
}
// Concatenates the elements of `strs`, inserting the separator `c` between
// consecutive elements. Returns an empty string when `strs` is empty.
std::string JoinString(const std::string& c,
                       const std::vector<std::string>& strs) {
  std::string result;
  bool first = true;
  for (const std::string& piece : strs) {
    // The separator goes before every element except the first; appending in
    // place avoids building a temporary `piece + c` string per element.
    if (!first) {
      result += c;
    }
    result += piece;
    first = false;
  }
  return result;
}
// Returns true if every byte of `str` is alphabetic according to isalpha().
// An empty string yields true.
bool IsAlpha(const std::string& str) {
  for (const char c : str) {
    // isalpha() is only defined for values representable as unsigned char
    // (or EOF); cast so bytes >= 0x80 are not passed as negative ints.
    if (!isalpha(static_cast<unsigned char>(c))) {
      return false;
    }
  }
  return true;
}
std::string ProcessBlank(const std::string& str, bool lowercase) { std::string result; if (!str.empty()) { std::vector<std::string> chars; SplitUTF8StringToChars(Trim(str), &chars);
for (std::string& ch : chars) { if (ch != kSpaceSymbol) { result.append(ch); } else { // Ignore consecutive space or located in head
if (!result.empty() && result.back() != ' ') { result.push_back(' '); } } } // Ignore tailing space
if (!result.empty() && result.back() == ' ') { result.pop_back(); } // NOTE: convert string to wstring
// see issue 745: https://github.com/wenet-e2e/wenet/issues/745
try { std::locale loc(""); std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter; std::wstring wsresult = converter.from_bytes(result); for (auto& c : wsresult) { c = lowercase ? tolower(c, loc) : c; } result = converter.to_bytes(wsresult); } catch (std::exception& e) { LOG(ERROR) << "convert wstring error " << e.what(); } } return result; }
// Returns a copy of `str` with the leading characters that belong to
// WHITESPACE removed. If `str` consists only of such characters (or is
// empty), an empty string is returned.
std::string Ltrim(const std::string& str) {
  const size_t first_kept = str.find_first_not_of(WHITESPACE);
  if (first_kept == std::string::npos) {
    return "";
  }
  return str.substr(first_kept);
}
// Returns a copy of `str` with the trailing characters that belong to
// WHITESPACE removed. If `str` consists only of such characters (or is
// empty), an empty string is returned.
std::string Rtrim(const std::string& str) {
  const size_t last_kept = str.find_last_not_of(WHITESPACE);
  if (last_kept == std::string::npos) {
    return "";
  }
  return str.substr(0, last_kept + 1);
}
// Returns a copy of `str` with both ends trimmed: Ltrim() is applied first,
// then Rtrim() on its result.
std::string Trim(const std::string& str) {
  const std::string without_leading = Ltrim(str);
  return Rtrim(without_leading);
}
// Joins two path components with a '/' separator.
//
// A separator is inserted only when `left` is non-empty and does not
// already end with '/'. `right` is appended unchanged, so an empty `left`
// simply returns `right`.
std::string JoinPath(const std::string& left, const std::string& right) {
  std::string joined = left;
  const bool needs_separator = !joined.empty() && joined.back() != '/';
  if (needs_separator) {
    joined += '/';
  }
  joined += right;
  return joined;
}
#ifdef _MSC_VER
// Converts the multibyte string `str` to a wide string using mbstowcs().
//
// As before, the C locale's LC_CTYPE category is set from the environment
// (setlocale(LC_CTYPE, "")) before converting; note that this changes
// process-wide state. Conversion stops at the first NUL in `str`. Returns an
// empty wide string when `str` is empty or when mbstowcs() reports an
// invalid multibyte sequence.
std::wstring ToWString(const std::string& str) {
  setlocale(LC_CTYPE, "");
  // A multibyte string never decodes to more wide characters than it has
  // bytes; one extra slot leaves room for the terminator mbstowcs() writes,
  // so the buffer is valid even for an empty input.
  std::wstring wstr(str.size() + 1, L'\0');
  const size_t converted = mbstowcs(&wstr[0], str.c_str(), wstr.size());
  if (converted == static_cast<size_t>(-1)) {
    // Invalid multibyte sequence: the buffer contents are not meaningful.
    return std::wstring();
  }
  wstr.resize(converted);
  return wstr;
}
#endif
} // namespace wenet
|