Program Listing for File text_processor.h
↰ Return to documentation for file (src/translator/text_processor.h)
#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_
#define SRC_BERGAMOT_TEXT_PROCESSOR_H_
#include <vector>
#include "annotation.h"
#include "data/types.h"
#include "data/vocab.h"
#include "definitions.h"
#include "sentence_splitter.h"
#include "vocabs.h"
namespace marian {
namespace bergamot {
// TextProcessor works with the sentencepiece vocabulary (held through
// Vocabs) and also contains an instance of sentence-splitter based on ssplit.
//
// Used in Service to convert an incoming blob of text to a vector of
// sentences (vector of words). In addition, the ByteRanges of the
// source-tokens in unnormalized text are provided as string_views.
class TextProcessor {
 public:
  // Constructs a TextProcessor.
  //
  // vocabs:  this class keeps a `const Vocabs &` member (vocabs_), so the
  //          referenced object presumably has to outlive the TextProcessor --
  //          confirm against the constructor definition.
  // options: NOTE(review): presumably configures sentence_splitter_ and
  //          max_length_break_; not visible from this header.
  explicit TextProcessor(Vocabs &vocabs, Ptr<Options>);

  // Processes the text held in `source`; both `source` and `segments` are
  // taken by non-const reference and act as in/out parameters.
  // NOTE(review): per the class description, this is expected to split the
  // text into sentences, tokenize them into `segments` and record the token
  // byte-ranges in `source` -- confirm against the definition.
  void process(AnnotatedText &source, Segments &segments);

 private:
  // Tokenizes an input string, returns Words corresponding. Loads the
  // corresponding byte-ranges into tokenRanges.
  Segment tokenize(const string_view &input, std::vector<string_view> &tokenRanges);

  // Wrap into sentences of at most max_length_break_ tokens and add to source.
  void wrap(Segment &sentence, std::vector<string_view> &tokenRanges, Segments &segments, AnnotatedText &source);

  // Shorthand for the end-of-sentence id of the source vocabulary.
  // vocabs_.sources().front() is invoked as we currently only support one
  // source vocab.
  // NOTE(review): the previous comment said this is "used only in
  // truncate()", but no truncate() is declared in this class -- likely stale.
  //
  // Returned by plain value: a top-level `const` on a by-value return type
  // adds no protection for callers and only inhibits moving from the result.
  Word sourceEosId() const { return vocabs_.sources().front()->getEosId(); }

  // Data members. Declaration order is kept as-is because it determines
  // initialization order in the constructor.
  const Vocabs &vocabs_;                // non-owning reference to the vocabularies
  SentenceSplitter sentence_splitter_;  // ssplit-based sentence splitter
  size_t max_length_break_;             // upper bound on tokens per wrapped sentence (see wrap())
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_