LT Project: Parsing Reviews

The reviews downloaded as part of the LT records are the creation of one person, myself. As such, although they have a theoretically extensive HTML format model, in practice that is limited to a few tags. For reasonable display we still need to parse that formatting.

The ReviewFormatter member function formatReview() is short:

/// Append one review to the GTK text buffer under a "Review" category.
///
/// The raw value is first normalised: every "[return]" marker becomes a
/// newline and all <p>/</p> tags (any letter case) are removed.  If markup
/// remains, the text is split into tagged runs by TextBlockSource and each run
/// is handed to a ReviewUpdater; otherwise the normalised text is inserted
/// as-is.
///
/// @param inValue  Review text as downloaded, possibly containing markup.
void ReviewFormatter::formatReview(const std::string &inValue) const
{
    gtk_text_buffer_get_end_iter(m_buffer, m_iter);
    m_sink->insertCategory("Review");

    std::string val(
        boost::algorithm::replace_all_copy(inValue, "[return]", "\n"));
    boost::algorithm::ierase_all(val, "<p>");
    boost::algorithm::ierase_all(val, "</p>");

    if (const std::size_t off = val.find('<'); off != std::string::npos)
    {
        // `off` is the length of the plain text preceding the first tag.
        TextBlockSource source(val, off);
        std::copy(source.begin(), source.end(),
                  boost::make_function_output_iterator(
                      ReviewUpdater(m_buffer, m_iter)));
    }
    else
    {
        // Insert the normalised text, not the raw input: otherwise the
        // "[return]" markers and <p> tags removed above would reappear.
        m_sink->insertSmallText(val);
    }

    m_sink->insertPlainText("\n");
}

but it has significant dependencies which are used only for this purpose (at least at present). They're not formally part of the class, but separate classes, frequently to enable testability. (Classes which know about GTK are not easily testable). Let's work downward through the dependencies.

TextBlockSource provides an input-iterator interface over a block of text with markup. Dereferencing the iterator returns objects of type TaggedText.

class TextBlockSource

{

public:

class iterator

{

public:

using iterator_category = std::input_iterator_tag;

using difference_type = std::ptrdiff_t;

using value_type = TextBlockWithType;

using pointer = value_type *;

using reference = value_type &;

explicit iterator(std::shared_ptr<TextBlockWithType> inBlock):

m_block(inBlock), m_type(static_cast<int>(TaggedText::Type::initial))

{ }

iterator(): m_type(static_cast<int>(TaggedText::Type::end)) {}

bool operator==(const iterator &inOther) const

{

return m_type == inOther.m_type;

}

bool operator!=(const iterator &inOther) const

{

return m_type != inOther.m_type;

}

TaggedText operator*()

{

if (m_block.get())

{

TaggedText rval = (*m_block)();

m_type = rval.type;

return rval;

}

else

return TaggedText{ nullptr, static_cast<int>(TaggedText::Type::end) };

}

iterator &operator++() { return *this; }

private:

std::shared_ptr<TextBlockWithType> m_block;

int m_type;

};

TextBlockSource(std::string_view inVal, std::size_t inOffset):

m_block(new TextBlockWithType(inVal, inOffset))

{ }

iterator begin() { return iterator(m_block); }

iterator end() { return iterator(); }

private:

std::shared_ptr<TextBlockWithType> m_block;

};

TaggedText is defined as

/// One run of text together with the formatting flags that apply to it.
struct TaggedText
{
    /// Formatting and control flags.  Every value except `text` occupies its
    /// own bit so that several can be combined in one int.
    enum class Type
    {
        text       = 0,
        italic     = 1 << 0,
        bold       = 1 << 1,
        blockquote = 1 << 2,
        error      = 1 << 3,
        initial    = 1 << 4,
        end        = 1 << 5
    };

    /// Raw C-string pointer; this struct does not manage its lifetime.
    const char *data;

    /// Combination of Type values stored as an int.
    int type;
};

(Why use a const char* rather than a string_view? Because the text is going to end up being passed to a C function at the GTK level.) TextBlockSource is essentially a wrapper around TextBlockWithType:

class TextBlockWithType

{

public:

TextBlockWithType(): m_state(static_cast<int>(TaggedText::Type::end)) {}

~TextBlockWithType();

TextBlockWithType(std::string_view inVal, const std::size_t inOffset);

TextBlockWithType(const TextBlockWithType &inOther):

m_remaining(inOther.m_remaining), m_current(inOther.m_current),

m_state(inOther.m_state)

{ }

TextBlockWithType &operator=(const TextBlockWithType &inOther)

{

if (this == &inOther)

return *this;

m_remaining = inOther.m_remaining;

m_current = inOther.m_current;

m_state = inOther.m_state;

return *this;

}

TextBlockWithType &operator=(TextBlockWithType &&inOther)

{

if (this == &inOther)

return *this;

m_remaining = inOther.m_remaining;

m_current = std::move(inOther.m_current);

m_state = inOther.m_state;

return *this;

}

TextBlockWithType(TextBlockWithType &&inOther):

m_remaining(inOther.m_remaining), m_current(inOther.m_current),

m_state(inOther.m_state)

{ }

TaggedText operator()();

private:

TaggedText handleIntermediateState();

void transitionToError(std::size_t inSize, const bool inReset)

{

m_error = std::string(m_remaining.substr(0, inSize));

if (inReset)

m_state = static_cast<int>(static_cast<int>(TaggedText::Type::error));

else

m_state += static_cast<int>(static_cast<int>(TaggedText::Type::error));

}

std::string_view m_remaining;

std::string m_current;

std::string m_error;

int m_state;

inline static const char nullrval[] = "";

};

TextBlockWithType does use a string_view to manage the interned data during a parse.

The constructor sets up an initial value (which will be plain text) if provided with an offset; otherwise the type of the first block of text is determined from the opening tag:

namespace
{
    // File-local parser instance shared by the TextBlockWithType members.
    TagTransitionParser theParser{};
}

...

/// Prime the splitter with the text before the first tag (if any) and the
/// first tag itself.
///
/// Precondition: inVal[inOffset] is the '<' that opens the first tag; the
/// parser throws std::runtime_error if that character is something else.
///
/// @param inVal     Text to split; only a view is stored.
/// @param inOffset  Length of the plain text preceding the first tag.  Zero
///                  means the text starts with a tag, so no initial plain run
///                  is queued.
TextBlockWithType::TextBlockWithType(std::string_view inVal,
                                     const std::size_t inOffset):
    m_remaining(inVal),
    m_state(static_cast<int>(TaggedText::Type::initial))
{
    const bool hasLeadingText = (inOffset != 0);
    if (hasLeadingText)
    {
        m_current = std::string(m_remaining.substr(0, inOffset));
        m_remaining.remove_prefix(inOffset);
    }

    auto [type, start, size] = theParser.ParseNextTag(m_remaining);
    if ((type == TaggedText::Type::error) || !start)
    {
        // An unknown tag, or a closing tag with nothing open.  With leading
        // text the initial flag is kept so that text is still emitted first;
        // without it the state becomes the error alone.
        transitionToError(size, !hasLeadingText);
    }
    else if (hasLeadingText)
    {
        m_state += static_cast<int>(type);
    }
    else
    {
        m_state = static_cast<int>(type);
    }

    // Must follow transitionToError(), which reads the tag from m_remaining.
    m_remaining.remove_prefix(size);
}

Each call to () advances the processing, with special handling for errors and the initial state:

/// Return the next run of text and advance the internal state.
///
/// A pending initial run is returned first (as plain text), then a pending
/// error (the captured tag text, typed with the full current state), and
/// otherwise the next run found by scanning the remaining input.
TaggedText TextBlockWithType::operator()()
{
    constexpr int initialBit = static_cast<int>(TaggedText::Type::initial);
    constexpr int errorBit = static_cast<int>(TaggedText::Type::error);

    if (m_state & initialBit)
    {
        m_state -= initialBit;
        return { m_current.c_str(), static_cast<int>(TaggedText::Type::text) };
    }

    if (m_state & errorBit)
    {
        // Report the state including the error flag, then clear the flag.
        const int currentType = m_state;
        m_state -= errorBit;
        return { m_error.c_str(), currentType };
    }

    // We will make the simplifying assumption that we can combine
    // b and blockquote and i and blockquote but (1) not i and b
    // and (2) not immediately after each other and (3) b and i
    // would be within blockquote, not vice versa.  These are
    // currently true of the data set and can be kept that way.
    return handleIntermediateState();
}

handleIntermediateState() uses a stateless parser to determine types and manages the state transitions, as well as returning a block of tagged text when one becomes available.

/// Emit the text up to the next tag, typed with the state in force on entry,
/// and update the state from that tag.
///
/// When no further tag can start (no '<' left, or fewer than three characters
/// from the '<' to the end) the rest of the usable text is emitted and the
/// state becomes Type::end.
TaggedText TextBlockWithType::handleIntermediateState()
{
    constexpr int endState = static_cast<int>(TaggedText::Type::end);
    constexpr int errorBit = static_cast<int>(TaggedText::Type::error);

    const int currentType = m_state;
    const std::size_t off = m_remaining.find('<');

    if (off == std::string_view::npos)
    {
        m_current = std::string(m_remaining);
        // Clear the consumed input so a later call cannot emit it again.
        m_remaining = std::string_view();
        m_state = endState;
    }
    else if (m_remaining.length() - off < 3)
    {
        // Too few characters left for even the shortest tag: keep the text
        // before the '<' and discard the rest.
        m_current = std::string(m_remaining.substr(0, off));
        m_remaining = std::string_view();
        m_state = endState;
    }
    else
    {
        m_current = std::string(m_remaining.substr(0, off));
        m_remaining.remove_prefix(off);

        auto [type, start, length] = theParser.ParseNextTag(m_remaining);
        if (type == TaggedText::Type::error)
        {
            transitionToError(length, false);
        }
        else
        {
            m_state = theParser.CalculateTransition(m_state, type, !start);
            if (m_state & errorBit)
                m_error = std::string(m_remaining.substr(0, length));
        }
        // The tag is consumed whether or not it was accepted.
        m_remaining.remove_prefix(length);
    }

    return { m_current.c_str(), currentType };
}

The TagTransitionParser's ParseNextTag() takes as input a string beginning with a < and passes back a tuple holding the tag's type encoded as an enum, a flag indicating whether it is an opening or a closing tag, and the tag's length. If the tag is unknown, it passes back an error type.

class TagTransitionParser {

public:

std::tuple<TaggedText::Type, bool, int>

ParseNextTag (std::string_view inRest) const;

int CalculateTransition (const int inExisting,

const TaggedText::Type inNew,

const bool inIsEnd) const;

};

std::tuple<TaggedText::Type, bool, int>

TagTransitionParser::ParseNextTag (std::string_view inRest) const

{

if (inRest[0] != '<')

throw std::runtime_error ("Not a tag beginning");

std::size_t offset = inRest.find ('>');

if (offset == std::string::npos)

return std::make_tuple (TaggedText::Type::error, true, 1);

std::string_view tag = inRest.substr (0, offset+1);

switch (tag.length ())

{

case 3:

switch (std::tolower (tag[1]))

{

case 'b':

return std::make_tuple (TaggedText::Type::bold, true, 3);

break;

case 'i':

return std::make_tuple (TaggedText::Type::italic, true, 3);

break;

default:

return std::make_tuple (TaggedText::Type::error, true, 3);

break;

}

break;

case 4:

if (tag[1] != '/')

return std::make_tuple (TaggedText::Type::error, true, 4);

switch (std::tolower (tag[2]))

{

case 'b':

return std::make_tuple (TaggedText::Type::bold, false, 4);

break;

case 'i':

return std::make_tuple (TaggedText::Type::italic, false, 4);

break;

default:

return std::make_tuple (TaggedText::Type::error, false, 4);

break;

}

break;

case 12:

if ((tag == "<blockquote>"sv) || (tag == "<BLOCKQUOTE>"sv))

return std::make_tuple (TaggedText::Type::blockquote, true, 12);

else

return std::make_tuple (TaggedText::Type::error, true, 12);

break;

case 13:

if ((tag == "</blockquote>"sv) || (tag == "</BLOCKQUOTE>"sv))

return std::make_tuple (TaggedText::Type::blockquote, false, 13);

else

return std::make_tuple (TaggedText::Type::error, false, 13);

break;

default:

return std::make_tuple (TaggedText::Type::error, true, tag.length ());

}

CalculateTransition() takes an integer containing a bitwise representation of the current state and returns the state adjusted for the tag just parsed. It will also signal an error on an unknown or unsupported transition. There is also a bit of special logic to close off bold or italic text when a quote begins; this is technically legal HTML but should never occur in the domain being parsed.

/// Work out the state that results from opening or closing a tag.
///
/// Closing a tag that is not open, or opening one that already is, yields the
/// existing state with the error bit added.  A blockquote tag (opening or
/// closing) additionally clears any bold and italic bits.
///
/// @param inExisting  Current combination of TaggedText::Type bits.
/// @param inNew       Type of the tag being applied.
/// @param inIsEnd     true for a closing tag, false for an opening tag.
/// @return The new combination of bits.
int
TagTransitionParser::CalculateTransition (const int inExisting, const TaggedText::Type inNew,
                                          const bool inIsEnd) const
{
    const int tagBit = static_cast<int> (inNew);
    const bool alreadyOpen = (inExisting & tagBit) != 0;

    // A closing tag needs the bit set; an opening tag needs it clear.
    if (inIsEnd != alreadyOpen)
        return inExisting | static_cast<int> (TaggedText::Type::error);

    int next = inExisting;
    if (inNew == TaggedText::Type::blockquote)
    {
        const int inlineBits = static_cast<int> (TaggedText::Type::bold)
                               | static_cast<int> (TaggedText::Type::italic);
        next &= ~inlineBits;
    }

    return inIsEnd ? (next & ~tagBit) : (next | tagBit);
}

All this supports an STL std::copy into an output iterator formed around ReviewUpdater.

class ReviewUpdater

{

public:

ReviewUpdater(GtkTextBuffer *inBuffer, GtkTextIter *inIter):

m_buffer(inBuffer), m_iter(inIter)

{ }

void operator()(const TaggedText &inVal)

{

auto [v, t] = inVal;

if (t == 0)

gtk_text_buffer_insert_with_tags_by_name(m_buffer, m_iter, v, -1,

"small", nullptr);

else

{

switch (auto sp = m_trans.GetArgValues(t); sp.size())

{

case 1:

{

std::string s(sp[0]);

gtk_text_buffer_insert_with_tags_by_name(

m_buffer, m_iter, v, -1, "small", s.c_str(), nullptr);

}

break;

case 2:

{

std::string s1(sp[0]);

std::string s2(sp[1]);

gtk_text_buffer_insert_with_tags_by_name(m_buffer, m_iter, v, -1,

"small", s1.c_str(),

s2.c_str(), nullptr);

}

break;

case 3:

{

std::string s1(sp[0]);

std::string s2(sp[1]);

std::string s3(sp[2]);

gtk_text_buffer_insert_with_tags_by_name(

m_buffer, m_iter, v, -1, "small", s1.c_str(), s2.c_str(),

s3.c_str(), nullptr);

}

break;

case 4:

{

std::string s1(sp[0]);

std::string s2(sp[1]);

std::string s3(sp[2]);

std::string s4(sp[2]);

gtk_text_buffer_insert_with_tags_by_name(

m_buffer, m_iter, v, -1, "small", s1.c_str(), s2.c_str(),

s3.c_str(), s4.c_str(), nullptr);

}

break;

}

private:

GtkTextBuffer *m_buffer;

GtkTextIter *m_iter;

TypeTranslator m_trans;

};

ReviewUpdater expands the types returned by TypeTranslator into a set of named values to be passed to GTK.

/// Translates the low four bits of an encoded formatting value into the
/// corresponding names ("italic", "bold", "blockquote", "error").
class TypeTranslator
{
public:
    /// @param inEncodedVal  Combination of formatting bits.
    /// @return Names for the bits that are set.  The span refers to storage
    ///         inside this object.
    std::span<std::string_view> GetArgValues(const int inEncodedVal);

private:
    /// Fill @p outArray with the names of the lowest N set bits of @p inSet,
    /// in ascending bit order, and return a span over the whole array.
    /// Slots beyond the number of set bits are left untouched.
    template <std::size_t N>
    static std::span<std::string_view>
    process(std::array<std::string_view, N> &outArray,
            const std::bitset<4> &inSet)
    {
        std::size_t filled = 0;
        for (std::size_t bit = 0; (bit < inSet.size()) && (filled < N); ++bit)
        {
            // Bit index i is looked up under key i + 1.
            if (inSet.test(bit))
                outArray[filled++] = intToStringView(static_cast<int>(bit) + 1);
        }
        return std::span<std::string_view>(outArray);
    }

    // One fixed-size buffer per possible result length.
    std::array<std::string_view, 1> m_array1;
    std::array<std::string_view, 2> m_array2;
    std::array<std::string_view, 3> m_array3;
    std::array<std::string_view, 4> m_array4;

    /// Look up the name for a 1-based bit position; throws
    /// std::runtime_error when either table has no entry.
    static std::string_view intToStringView(const int inVal);

    // NOTE(review): int_to_type is not defined in this excerpt; presumably it
    // maps a 1-based bit position to the matching TaggedText::Type -- confirm.
    inline static const std::map<int, TaggedText::Type> IntToType{
        { 1, int_to_type(1) },
        { 2, int_to_type(2) },
        { 3, int_to_type(3) },
        { 4, int_to_type(4) }
    };

    inline static const std::map<TaggedText::Type, std::string> TypeToString{
        { TaggedText::Type::italic, "italic" },
        { TaggedText::Type::bold, "bold" },
        { TaggedText::Type::blockquote, "blockquote" },
        { TaggedText::Type::error, "error" }
    };
};

TypeTranslator passes back a span of values -- from 1 to 4 in size -- based on an integer encoded with the formatting information, managing the encoding by copying to a bitset:

/// Return the names of the formatting bits set in the low four bits of
/// @p inEncodedVal.
///
/// The result has one entry per set bit, in ascending bit order, and refers
/// to a member array of matching size.  If none of the four bits is set the
/// result is empty rather than a view of leftover entries.
///
/// @throws std::runtime_error (from intToStringView) if a bit has no name.
std::span<std::string_view>
TypeTranslator::GetArgValues(const int inEncodedVal)
{
    const std::bitset<4> bits(inEncodedVal);

    switch (bits.count())
    {
    case 0:
        return {};
    case 1:
        return process<1>(m_array1, bits);
    case 2:
        return process<2>(m_array2, bits);
    case 3:
        return process<3>(m_array3, bits);
    default:
        return process<4>(m_array4, bits);
    }
}

std::string_view TypeTranslator::intToStringView(const int inVal)

{

auto iter = IntToType.find(inVal);

if (iter == IntToType.end())

{

throw std::runtime_error("IntToType missing argument");

}

auto iter2 = TypeToString.find(iter->second);

if (iter2 == TypeToString.end())

{

std::ostringstream str;

str << "TypeToString missing argument: " << inVal << ","

<< static_cast<int>(iter->second);

throw std::runtime_error(str.str());

}

return iter2->second;

}

This model has its limits as a general parsing model -- the largest number of distinct states it could support would be 64, if one moved to an unsigned 64-bit integer as the means of combining the states -- but for the limited number of states in the domain in question it works admirably. The secondary classes are all independently testable, which is a bonus.

Search This Blog

C++ Development: The Breviary Project

LT Project: Parsing Reviews

Comments

Post a Comment

Popular posts from this blog

Boundaries

State Machines

Considerations on an Optimization