Program Listing for File text_processor.cpp

Return to documentation for file (src/translator/text_processor.cpp)

#include "text_processor.h"

#include <vector>

#include "annotation.h"
#include "common/options.h"
#include "data/types.h"
#include "definitions.h"

namespace marian {
namespace bergamot {

Segment TextProcessor::tokenize(const string_view &segment, std::vector<string_view> &wordRanges) {
  // vocabs_->sources().front() is invoked as we currently only support one source vocab
  return vocabs_.sources().front()->encodeWithByteRanges(segment, wordRanges, /*addEOS=*/false, /*inference=*/true);
}

TextProcessor::TextProcessor(Vocabs &vocabs, Ptr<Options> options) : vocabs_(vocabs), sentence_splitter_(options) {
  max_length_break_ = options->get<int>("max-length-break");
  max_length_break_ = max_length_break_ - 1;
  ABORT_IF(max_length_break_ < 0, "max-length-break cannot be < 0");
}

void TextProcessor::process(AnnotatedText &source, Segments &segments) {
  string_view query = string_view(source.text);
  auto sentenceStream = sentence_splitter_.createSentenceStream(query);
  std::string_view sentenceStringPiece;

  while (sentenceStream >> sentenceStringPiece) {
    marian::string_view sentence(sentenceStringPiece.data(), sentenceStringPiece.size());

    std::vector<string_view> wordRanges;
    Segment segment = tokenize(sentence, wordRanges);

    // There are some cases where SentencePiece or vocab returns no words
    // after normalization. 0 prevents any empty entries from being added.
    if (segment.size() > 0) {
      // Wrap segment into sentences of at most max_length_break_ tokens and
      // tell source about them.
      wrap(segment, wordRanges, segments, source);
    }
  }
}

void TextProcessor::wrap(Segment &segment, std::vector<string_view> &wordRanges, Segments &segments,
                         AnnotatedText &source) {
  for (size_t offset = 0; offset < segment.size(); offset += max_length_break_) {
    auto start = segment.begin() + offset;

    size_t left = segment.size() - offset;
    size_t diff = std::min(max_length_break_, left);

    segments.emplace_back(start, start + diff);
    segments.back().push_back(sourceEosId());

    auto astart = wordRanges.begin() + offset;
    // diff > 0
    source.recordExistingSentence(astart, astart + diff, astart->data());
  }
}

}  // namespace bergamot
}  // namespace marian