wordcount mit string

Habe auch noch eine Lösung gemacht:

#include <algorithm>
#include <bitset>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

void split( const string& text, vector<string>& words )
{
	bitset<255> separator_lookup;
	separator_lookup.flip();

	const char* alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

	for(; *alphabet; ++alphabet)
	{
		separator_lookup[ *alphabet ] = false;
	}

	string::const_iterator start;
	bool word = false;

	for(string::const_iterator it=text.begin(); it!=text.end(); ++it)
	{
		if( separator_lookup[ *it ] )
		{
			if( word )
			{
				words.push_back( string(start,it) );
				word = false;
			}
		}
		else if( !word )
		{
			start = it;
			word = true;
		}
	}

	if( word )
		words.push_back( string(start,text.end()) );
}

unsigned wordcount( const string& text )
{
	vector<string> words;
	split( text, words );
	return words.size();
}

int main()
{
	string text = "   Hallo Welt, Du grausame    Welt , hast Tabs und auch Zeilen sowie      Steuerzeichen Sie haben 12 Woerter eingegeben   ";

	cout << wordcount( text ) << '\n';
}

Ethon

std::array<bool, N> statt std::bitset<N> - Stackspeicher muss sowieso nicht gespart werden und durch das Bits nudeln geht der Performance-Vorteil flöten.

Ethon schrieb:

std::array<bool, N> statt std::bitset<N> - Stackspeicher muss sowieso nicht gespart werden und durch das Bits nudeln geht der Performance-Vorteil flöten.

:p
Wenns nun auch um Performance geht... Man hätte natürlich auch einen einfachen Zähler statt std::vector nehmen können. Aber irgendwann kommt wer, und will neben der Anzahl auch noch die einzelnen Wörter haben.:p

Ethon schrieb:

std::array<bool, N> statt std::bitset<N> - Stackspeicher muss sowieso nicht gespart werden und durch das Bits nudeln geht der Performance-Vorteil flöten.

Habs gerade mal getestet. Du hast Recht. std::array<bool,N> ist sogar deutlich schneller.

#include <algorithm>
#include <bitset>
#include <iostream>
#include <string>
#include <vector>
#include <array>
#include <cstdlib>
#include <boost/timer.hpp>
using namespace std;

unsigned wordcount_bitset( const string& text )
{
    bitset<255> separator_lookup;
    separator_lookup.flip();

    const char* alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    for(; *alphabet; ++alphabet)
    {
        separator_lookup[ *alphabet ] = false;
    }

    string::const_iterator start;
    bool word = false;
	unsigned counter = 0;

    for(string::const_iterator it=text.begin(); it!=text.end(); ++it)
    {
        if( separator_lookup[ *it ] )
        {
            if( word )
            {
				++counter;
                word = false;
            }
        }
        else if( !word )
        {
            start = it;
            word = true;
        }
    }

    if( word )
		++counter;

	return counter;
}

unsigned wordcount_std_array( const string& text )
{
    std::array<bool,255> separator_lookup;
	separator_lookup.fill(true);

    const char* alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    for(; *alphabet; ++alphabet)
    {
        separator_lookup[ *alphabet ] = false;
    }

    string::const_iterator start;
    bool word = false;
	unsigned counter = 0;

    for(string::const_iterator it=text.begin(); it!=text.end(); ++it)
    {
        if( separator_lookup[ *it ] )
        {
            if( word )
            {
				++counter;
                word = false;
            }
        }
        else if( !word )
        {
            start = it;
            word = true;
        }
    }

    if( word )
		++counter;

	return counter;
}

int main()
{
	srand( time(NULL) );

	const unsigned SIZE = 1000000000; // 1 Mrd.
	string text( SIZE, 'x' );

	for(unsigned i=0; i!=SIZE/100; ++i)
	{
		if( i%100 )
			text[i] = ' ';
	}

	boost::timer timer;

	// wordcount_bitset
	timer.restart();
	unsigned words = wordcount_bitset( text );
	cout << "wordcount_bitset: " << timer.elapsed() << "s  |  Words: " << words << '\n'; // 6,9 Sekunden

	// wordcount_std_array
	timer.restart();
	words = wordcount_std_array( text );
	cout << "wordcount_std_array: " << timer.elapsed() << "s  |  Words: " << words << '\n'; // 1,4 Sekunden
}

Huch, das srand hat da drin natürlich nichts mehr verloren.

Hab das Ganze eben noch mit vector<bool> und vector<char> getestet:

std::vector<bool>: 2,8 Sekunden
std::vector<char>: 1,4 Sekunden

array<bool,255> und vector<char>( 255, true ) sind gleich schnell.

Sone

Und mit STD_VECTOR_BOOL_NOT_SPECIAL ?

Sone schrieb:

Und mit STD_VECTOR_BOOL_NOT_SPECIAL ?

Beides 2,8 Sekunden VS11.

Nexus

Das Proposal zur Entfernung oder Deprecation von std::vector<bool> wurde nicht umgesetzt. Der Standard wurde sogar erweitert, um std::hash zu spezialisieren.

Ethon

out schrieb:

Hab das Ganze eben noch mit vector<bool> und vector<char> getestet:

std::vector<bool>: 2,8 Sekunden
std::vector<char>: 1,4 Sekunden

array<bool,255> und vector<char>( 255, true ) sind gleich schnell.

Ist klar, die paar nano/mikro Sekunden die das malloc dauert machen bei 1 Milliarden Zeichen das Kraut auch nicht fett.