split string into max number of pieces (dictionary)

Hi,

i want to build the following algorithm:
Given a string and a table of valid words split the string into max number of pieces such that each piece is a valid word in itself in the best possible way.

for input : thisisawesome
and dictionary: [this, is, a, awe, we, some, awesome]
possible output: [this is a we some, this is awe some, this is awesome]

my approach is to find all words from the dict in the input string and insert the beginning and length in a hash table, matches at the same position are stored in a std::set and ordered by length of the match...

any idea how to improve it?

std::vector<std::string> match_best(const std::set<std::string> &dict, const string &str) {
	std::vector<std::string> words;
	std::unordered_map<int, std::set<int>> ht;
	int i = 0;

	for(auto it = dict.begin(); it != dict.end(); it++) {
		size_t j = -1;

		while((j = str.find(*it, j+1)) != std::string::npos) {
			auto it2 = ht.find(j);

			if(it2 != ht.end()) {
				it2->second.insert(it->length());
				ht.insert(std::make_pair(j, it2->second));
			}
			else {
				std::set<int> s = {static_cast<int>(it->length())};
				ht.insert(std::make_pair(j, s));
			}
		}
	}

	while(i < str.length() && ht.size() > 0) {
		auto it = ht.find(i);

		if(it != ht.end()) {
			// try to use match with min. number of characters
			words.push_back(str.substr(it->first, *it->second.begin()));
			i += *it->second.begin();

            // remove the value from the std::set
			it->second.erase(it->second.begin());
			// if std::set is empty -> delete key from hash table
			if(it->second.size() == 0) {
				ht.erase(it->first);
			}
		}
		else {
			if(words.size() == 0) {
				break;
			}

			// remove prev. match if no match for the rest string
			i -= words.back().size();
			words.pop_back();
		}
	}

	return words;
}

int main() {
	// your code goes here

	std::set<std::string> dict1 = {"this", "is", "a", "awe", "we", "some", "awesome"};
	std::vector<std::string> words = match_best(dict1, "thisisawesomesomea");
	for_each(words.begin(), words.end(), [](std::string &s){ cout << s << " "; });

	cout << '\n';

	std::set<std::string> dict2 = {"this", "is", "a", "awe", "we", "awesome"};
	words = match_best(dict2, "thisisawesome");
	for_each(words.begin(), words.end(), [](std::string &s){ cout << s << " "; });

	cout << '\n';

	std::set<std::string> dict3 = {"this", "is", "awe", "we", "some", "awesome"};
	words = match_best(dict3, "thisisawesome");
	for_each(words.begin(), words.end(), [](std::string &s){ cout << s << " "; });

    cout << '\n';

	std::set<std::string> dict4 = {"this", "is", "awe", "we", "thisisawesome"};
	words = match_best(dict4, "thisisawesome");
	for_each(words.begin(), words.end(), [](std::string &s){ cout << s << " "; });

    cout << '\n';

	std::set<std::string> dict5 = {"is", "awe", "we", "a"};
	words = match_best(dict5, "thisisawesome");
	for_each(words.begin(), words.end(), [](std::string &s){ cout << s << " "; });

	return 0;
}

minastaros

Yeah, on a first glimpse: make the printing for_each a function on its own
(I know you wanted something on the matching approach -- have to get the whole idea first...)

Werner Salomon

febro schrieb:

i want to build the following algorithm:
Given a string and a table of valid words split the string into max number of pieces such that each piece is a valid word in itself in the best possible way.

for input : thisisawesome
and dictionary: [this, is, a, awe, we, some, awesome]
possible output: [this is a we some, this is awe some, this is awesome]

my approach is to find all words from the dict in the input string and insert the beginning and length in a hash table, matches at the same position are stored in a std::set and ordered by length of the match...

any idea how to improve it?

Hi febro,

don't know whether my suggestion improve the algorithm. I'm not even sure, whether it fits Your requirement exactly.
The idea is to run over the input, character for character, and look which word still match. Founding a word save its iterator to ' result ' - the longest fitting word wins.

#include <iostream>
#include <iterator> // ostream_iterator<>
#include <string>
#include <vector>
#include <set>

template< typename C, typename I >
typename C::const_iterator get_word( const C& words, I& from, I to )
{
    using namespace std;
    vector< bool > ok( words.size(), true );
    typename C::const_iterator result = words.end();
    for( size_t column = 0;; ++column, ++from )
    {
        bool found_something = false;
        vector< bool >::iterator on_off = ok.begin();
        for( typename C::const_iterator i_word = words.begin(); i_word != words.end(); ++i_word, ++on_off )
        {
            if( !*on_off )
                continue;
            if( column == i_word->length() )
            {
                result = i_word; // word found; save position
                *on_off = false;
            }
            else if( from == to || (*i_word)[column] != *from )
                *on_off = false; // mismatch
            else
                found_something = true;
        }
        if( !found_something || from == to )
            break;
    }
    return result;
}

template< typename C, typename I >
std::vector< std::string > match_best( const C& dict, I from, I to )
{
    std::vector< std::string > result;
    for( typename C::const_iterator i
        ; (i = get_word( dict, from, to )) != dict.end(); ) // stops, if no matching word was found
        result.push_back( *i );
    return result;
}

int main() 
{
    using namespace std;
    const char* dict1_[] = {"this", "is", "a", "awe", "we", "some", "awesome"}; 
    set< string > dict1( dict1_, dict1_ + sizeof(dict1_)/sizeof(*dict1_) );
    const string in = "thisisawesomesomea";
    vector< string > words = match_best( dict1, in.begin(), in.end() );
    copy( words.begin(), words.end(), ostream_iterator< string >( cout << "> ", " " ) );
    cout << endl;
    return 0;
}

best regards
Werner

Assuming that you are only interested in the split with the most pieces, here is a algorithm in potentially O(m*n) (m=Dictionary length, n=string length) after you replace the slow std::search with a better algorithm:

using std::begin;
using std::endl;

// Change to Boost's boyer_moore_horspool if you need a better complexity
template <typename HaystackIter, typename Needle, typename Output>
void search_all(HaystackIter first, HaystackIter last, Needle &needle,
                Output out) {
  for (HaystackIter it = first;
       (it = std::search(it, last, begin(needle), end(needle))) != last;
       it += needle.size())
    *out++ = std::make_pair(it, &needle);
}

template <typename C, typename I, typename Value = typename C::value_type>
std::vector<Value> match_best(const C &dict, I from, I to) {
  std::vector<std::pair<I, Value const *> > events;
  for (auto &d : dict)
    search_all(from, to, d, std::back_inserter(events));
  std::sort(begin(events), end(events));
  std::vector<std::pair<int, Value const *> > dp(
      to - from + 1, std::make_pair(std::numeric_limits<int>::min(), nullptr));
  dp[0].first = 0;
  for (auto &e : events) {
    auto &old_res = dp[e.first - from + e.second->size()];
    auto new_res = std::make_pair(dp[e.first - from].first + 1, e.second);
    old_res = std::max(old_res, new_res);
  }
  std::vector<Value> result(dp.back().first);
  int res_idx = result.size() - 1;
  for (size_t i = dp.size() - 1; i; i = i - dp[i].second->size())
    result[res_idx--] = *dp[i].second;
  return result;
}

int main() {
  using namespace std;
  vector<string> dict1{ "this", "is", "a", "awe", "we", "some", "awesome" };
  const string in = "thisisawesomesomea";
  vector<string> words = match_best(dict1, begin(in), end(in));
  copy(begin(words), end(words), ostream_iterator<string>(cout << "> ", " "));
  cout << endl;
}

Output:

> this is a we some some a

Note: change lines 28 - 31 to

std::vector<Value> result(std::max(0,dp.back().first));
  int res_idx = result.size() - 1;
  for (size_t i = dp.size() - 1; res_idx >= 0; i = i - dp[i].second->size())
    result[res_idx--] = *dp[i].second;

if you don't want to crash my program if a valid splitting is impossible.

Also, add

std::vector<std::string> match_best(std::set<std::string> const& dict,
                                    std::string const& in)
{
  return match_best(dict, begin(in) end(in));
}

so that you can use my function within your main.

what is the complexity of you algorithm?

how about a trie algorithm?

#define ALPHABET_SIZE 26

typedef struct node {
	bool found;
	bool visited;
	node *link[ALPHABET_SIZE];
}node;

int LETTER(char x) {
	if(islower(x)) {
		return static_cast<int>(x) - static_cast<int>('a');
	}
	return -1;
}

node *insert (node *root, char *string) {
	if (root == NULL) {
		root = new node();
		root->visited = false;
		root->found = false;

		for(int i = 0; i < ALPHABET_SIZE; i++) {
			root->link[i] = NULL;
		}
	}
	if (*string == '\0') {
		root->found = true;
	}
	else {
		root->link[LETTER(string[0])] = insert(root->link[LETTER(string[0])], string+1);
	}

	return root;
}

void buildDictionary (node **root, std::vector<std::string> &dict) {
	*root = NULL;

	for (auto it = dict.begin(); it != dict.end(); ++it) {
		*root = insert(*root, (char *) it->c_str());
	}
}

int find (node *root, std::string str, int index, std::vector<std::string> &words) {
	if ((root == NULL) || (index == str.size() + 1)) {
		return 0;
	}

	if (!root->visited && root->found) {
		words.push_back(str);
		words.back().resize(index);
		root->visited = true;
		return index;
	}
	else {
		return find (root->link[LETTER(str[index])], str, index+1, words);
	}
}

void cleanUp (node *root) {
	if (root == NULL) {
		return;
	}

	for (int i = 0; i < ALPHABET_SIZE; i++) {
		cleanUp(root->link[i]);
	}

	delete root;
}

int main() {
	// your code goes here

	node *root = NULL;
	std::vector<std::string> dict = {"this", "is", "a", "awe", "we", "some", "awesome"}, words;
	std::string test = "thisisawesome";
	buildDictionary (&root, dict);
	int index = 0;

	while(index < test.length()) {
		int val = find(root, &test[index], 0, words);
		if(val > 0) {
			index += val;
		}
		else {
			if(words.size() == 0) {
				break;
			}
			index -= words.back().size();
			words.pop_back();
		}
	}

	for (auto it = words.begin(); it != words.end(); ++it) {
		std::cout << *it << std::endl;
	}

	cleanUp(root);

	return 0;
}

My algorithm runs in O(m*n*k) if there are m small strings with length of at most k and the big string to be splitted has length n. However, this can be easily converted to a O(m*(n+k)) = O(m*n) algorithm.

I tend to avoid tries as they usually exceed the memory limit by far and are in practice much slower even if they restrict the input to only lowercase letters.

Furthermore, you cannot use them here. Greedy approaches do not work. For example:

std::vector<std::string> dict = {"ab","c","d","e","f","g","a","bcdefg"}, words;
  std::string test = "abcdefg";

Produces

a
bcdefg

with your code, but

ab c d e f g

using my algorithm.

@i am awesome!
your algorithm fails for the following dictionary: dict1{ "this", "is", "a", "awe", "we", "awesome" };

@febro: What would the result be?

std::vector<std::string> dict = {"this", "is", "a", "awe", "we", "awesome"}, words;
    std::string test = "thisisawesome";

output should be:
this
is
awesome

And whats exactly the problem?