wordcount mit string

Ich wag ja kaum, eine Lösung fast ohne Hilfenahme der stl zu präsentieren, aber was sagt ihr hierzu?

template<class Forward1, class Forward2> inline
Forward1 find_first_notof(Forward1 first1, Forward1 last1,
						  Forward2 first2, Forward2 last2) 
{
	for (; first1 != last1; ++first1)
	{       
		Forward2 mid2;
		for (mid2 = first2; mid2 != last2; ++mid2)
			if (*first1 == *mid2)
				break;
		if(mid2==last2)
			return first1;
	}
	return first1;
}

size_t Count(const string& org, const string& separators)
{
	size_t c = 0;
	string::const_iterator en, be=org.begin();
	do
	{
		en=find_first_of(be, org.end(),separators.begin(), separators.end());
		if(be!=en)
			++c;
		be=find_first_notof(en, org.end(), separators.begin(), separators.end());
	} while(be!=org.end());
	return c;
}

int wordcount_fricker(string satz)
{
	return Count(satz," \n\t -;,.:?!:"); // seperators hinzufügen
}

[quote="SeppJ"]

Werner Salomon schrieb:

Ich werfe als Herausforderung std::set_symmetric_difference in den Raum.

Wozu? Es gibt noch genug einfache Lösungen

std::string tmp = ' '+s+' ';
return (std::unique(tmp.begin(), tmp.end(),
                    [](char c, char d){return isspace(c)!=isspace(d);})
        - tmp.begin() + 1)/2 << '\n';

Ethon

Euch zuliebe hab ich die count-Lösung mal angepasst:

int wordcount_Ethon(string satz)
{
    return count_if(satz.begin(), satz.end(), [&](char& c) { return &c == satz.c_str() ? !isspace(c) : isalnum(c) && isspace(*(&c - 1)); });
}

Ethon schrieb:

Euch zuliebe hab ich die count-Lösung mal angepasst:

Bitte ohne UB.

Ethon

fummler schrieb:

Ethon schrieb:

Euch zuliebe hab ich die count-Lösung mal angepasst:

Bitte ohne UB.

Seit C++11 doch kein UB mehr? Was stört?

Und selbst wenn : Geht doch eh nur ums frickeln.

Habe auch noch eine Lösung gemacht:

#include <algorithm>
#include <bitset>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

void split( const string& text, vector<string>& words )
{
	bitset<255> separator_lookup;
	separator_lookup.flip();

	const char* alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

	for(; *alphabet; ++alphabet)
	{
		separator_lookup[ *alphabet ] = false;
	}

	string::const_iterator start;
	bool word = false;

	for(string::const_iterator it=text.begin(); it!=text.end(); ++it)
	{
		if( separator_lookup[ *it ] )
		{
			if( word )
			{
				words.push_back( string(start,it) );
				word = false;
			}
		}
		else if( !word )
		{
			start = it;
			word = true;
		}
	}

	if( word )
		words.push_back( string(start,text.end()) );
}

unsigned wordcount( const string& text )
{
	vector<string> words;
	split( text, words );
	return words.size();
}

int main()
{
	string text = "   Hallo Welt, Du grausame    Welt , hast Tabs und auch Zeilen sowie      Steuerzeichen Sie haben 12 Woerter eingegeben   ";

	cout << wordcount( text ) << '\n';
}

Ethon

std::array<bool, N> statt std::bitset<N> - Stackspeicher muss sowieso nicht gespart werden und durch das Bits nudeln geht der Performance-Vorteil flöten.

Ethon schrieb:

std::array<bool, N> statt std::bitset<N> - Stackspeicher muss sowieso nicht gespart werden und durch das Bits nudeln geht der Performance-Vorteil flöten.

:p
Wenns nun auch um Performance geht... Man hätte natürlich auch einen einfachen Zähler statt std::vector nehmen können. Aber irgendwann kommt wer, und will neben der Anzahl auch noch die einzelnen Wörter haben.:p

Ethon schrieb:

std::array<bool, N> statt std::bitset<N> - Stackspeicher muss sowieso nicht gespart werden und durch das Bits nudeln geht der Performance-Vorteil flöten.

Habs gerade mal getestet. Du hast Recht. std::array<bool,N> ist sogar deutlich schneller.

#include <algorithm>
#include <bitset>
#include <iostream>
#include <string>
#include <vector>
#include <array>
#include <cstdlib>
#include <boost/timer.hpp>
using namespace std;

unsigned wordcount_bitset( const string& text )
{
    bitset<255> separator_lookup;
    separator_lookup.flip();

    const char* alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    for(; *alphabet; ++alphabet)
    {
        separator_lookup[ *alphabet ] = false;
    }

    string::const_iterator start;
    bool word = false;
	unsigned counter = 0;

    for(string::const_iterator it=text.begin(); it!=text.end(); ++it)
    {
        if( separator_lookup[ *it ] )
        {
            if( word )
            {
				++counter;
                word = false;
            }
        }
        else if( !word )
        {
            start = it;
            word = true;
        }
    }

    if( word )
		++counter;

	return counter;
}

unsigned wordcount_std_array( const string& text )
{
    std::array<bool,255> separator_lookup;
	separator_lookup.fill(true);

    const char* alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    for(; *alphabet; ++alphabet)
    {
        separator_lookup[ *alphabet ] = false;
    }

    string::const_iterator start;
    bool word = false;
	unsigned counter = 0;

    for(string::const_iterator it=text.begin(); it!=text.end(); ++it)
    {
        if( separator_lookup[ *it ] )
        {
            if( word )
            {
				++counter;
                word = false;
            }
        }
        else if( !word )
        {
            start = it;
            word = true;
        }
    }

    if( word )
		++counter;

	return counter;
}

int main()
{
	srand( time(NULL) );

	const unsigned SIZE = 1000000000; // 1 Mrd.
	string text( SIZE, 'x' );

	for(unsigned i=0; i!=SIZE/100; ++i)
	{
		if( i%100 )
			text[i] = ' ';
	}

	boost::timer timer;

	// wordcount_bitset
	timer.restart();
	unsigned words = wordcount_bitset( text );
	cout << "wordcount_bitset: " << timer.elapsed() << "s  |  Words: " << words << '\n'; // 6,9 Sekunden

	// wordcount_std_array
	timer.restart();
	words = wordcount_std_array( text );
	cout << "wordcount_std_array: " << timer.elapsed() << "s  |  Words: " << words << '\n'; // 1,4 Sekunden
}

Huch, das srand hat da drin natürlich nichts mehr verloren.

Hab das Ganze eben noch mit vector<bool> und vector<char> getestet:

std::vector<bool>: 2,8 Sekunden
std::vector<char>: 1,4 Sekunden

array<bool,255> und vector<char>( 255, true ) sind gleich schnell.

Sone

Und mit STD_VECTOR_BOOL_NOT_SPECIAL ?

Sone schrieb:

Und mit STD_VECTOR_BOOL_NOT_SPECIAL ?

Beides 2,8 Sekunden VS11.

Nexus

Das Proposal zur Entfernung oder Deprecation von std::vector<bool> wurde nicht umgesetzt. Der Standard wurde sogar erweitert, um std::hash zu spezialisieren.

Ethon

out schrieb:

Hab das Ganze eben noch mit vector<bool> und vector<char> getestet:

std::vector<bool>: 2,8 Sekunden
std::vector<char>: 1,4 Sekunden

array<bool,255> und vector<char>( 255, true ) sind gleich schnell.

Ist klar, die paar nano/mikro Sekunden die das malloc dauert machen bei 1 Milliarden Zeichen das Kraut auch nicht fett.