Program Listing for File sentence_splitter.cpp
Return to documentation for file (src/translator/sentence_splitter.cpp)
#include "sentence_splitter.h"
#include <string>
#include "common/cli_helper.h"
#include "common/logging.h"
#include "common/options.h"
namespace marian {
namespace bergamot {
/// Constructs a splitter configured from `options`.
///
/// Reads two string options:
///  - "ssplit-mode": converted to a split mode via string2splitmode() and
///    stored in mode_.
///  - "ssplit-prefix-file": when non-empty, the value is passed through
///    marian::cli::interpolateEnvVars() and the resulting path is loaded into
///    ssplit_. When empty, a warning is logged and nothing is loaded.
///
/// @param options  Option store to read from; also retained in options_.
SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options) : options_(options) {
  // Resolve the split mode before touching the prefix file, so that any
  // warning emitted by string2splitmode() precedes the prefix-file messages.
  mode_ = string2splitmode(options_->get<std::string>("ssplit-mode", ""));

  std::string prefixPath = options_->get<std::string>("ssplit-prefix-file", "");
  if (prefixPath.empty()) {
    LOG(warn,
        "Missing list of protected prefixes for sentence splitting. "
        "Set with --ssplit-prefix-file.");
    return;
  }

  prefixPath = marian::cli::interpolateEnvVars(prefixPath);
  LOG(info, "Loading protected prefixes for sentence splitting from {}", prefixPath);
  ssplit_.load(prefixPath);
}
/// Creates a sentence stream over `input`, using this splitter's ssplit_
/// object and the split mode stored in mode_.
///
/// @param input  Text to split. Only its data pointer and size are read here.
///               NOTE(review): presumably the returned stream refers to this
///               buffer rather than copying it, so the caller would need to
///               keep the underlying text alive — confirm against
///               ug::ssplit::SentenceStream.
/// @return A ug::ssplit::SentenceStream built from the input view, ssplit_
///         and mode_.
ug::ssplit::SentenceStream SentenceSplitter::createSentenceStream(const string_view &input) {
  // Rebuild the view as a std::string_view from pointer + length; the
  // unqualified `string_view` parameter type is resolved elsewhere
  // (presumably a project alias — TODO confirm).
  std::string_view input_converted(input.data(), input.size());
  // Return the prvalue directly. Wrapping a temporary in std::move() turns it
  // into an xvalue, which disables copy elision and forces an extra move
  // construction (clang: -Wpessimizing-move).
  return ug::ssplit::SentenceStream(input_converted, this->ssplit_, mode_);
}
/// Maps a textual split-mode name to the corresponding
/// ug::ssplit::SentenceStream::splitmode value.
///
/// Recognised spellings:
///  - "sentence", "Sentence"                        -> one_sentence_per_line
///  - "paragraph", "Paragraph"                      -> one_paragraph_per_line
///  - "wrapped_text", "WrappedText", "wrappedText"  -> wrapped_text
///
/// Any other value (including the empty string) logs a warning and falls back
/// to wrapped_text.
///
/// @param m  Mode name to interpret.
/// @return The matching split mode, or wrapped_text for unrecognised input.
ug::ssplit::SentenceStream::splitmode SentenceSplitter::string2splitmode(const std::string &m) {
  using SplitMode = ug::ssplit::SentenceStream::splitmode;
  // @TODO: throw Exception on error
  if (m == "sentence" || m == "Sentence") {
    return SplitMode::one_sentence_per_line;
  }
  if (m == "paragraph" || m == "Paragraph") {
    return SplitMode::one_paragraph_per_line;
  }

  // Everything else resolves to wrapped_text; warn only when the caller did
  // not actually ask for it by one of its accepted spellings.
  const bool isWrappedText = (m == "wrapped_text" || m == "WrappedText" || m == "wrappedText");
  if (!isWrappedText) {
    LOG(warn, "Ignoring unknown text input format specification: {}.", m);
  }
  return SplitMode::wrapped_text;
}
} // namespace bergamot
} // namespace marian