strtool.hpp

Go to the documentation of this file.
00001 #ifndef s11n_net_s11n_STRINGTOOL_HPP_INCLUDED
00002 #define s11n_net_s11n_STRINGTOOL_HPP_INCLUDED 1
00003 
00004 #include <string>
00005 #include <map>
00006 #include <locale>
00007 #include <iostream>
00008 #include <sstream>
00009 #include <queue>
00010 namespace s11n { namespace io {
00011 /**
00012 The strtool namespace encapsulates a set of utility functions for
00013 working with string objects. This mini-lib has unfortunately followed
00014 me from source tree to source tree like a little virus. While i have
00015 no special love for this code, it has proven useful time and time again.
00016 */
00017 namespace strtool {
00018 
00019         /**
00020            The functions in the STPrivate namespace are implementation
00021            details and should not be used by client code.
00022         */
00023         namespace STPrivate
00024         {
00025 
00026 
00027                 /**
00028                    Parses str into a value_type via operator>> on a
00029                    std::istringstream. errorVal is returned when the
00030                    stream is unusable or the extraction fails.
00031 
00032                    TODO: implement the following suggestion from
00033                    Kai Unger <kai.unger@hacon.de> (21 Sept 2004):
00034                    after the cast, check whether unread characters
00035                    remain. As it stands, casting
00036                    "1.2this_definitly_is_not_a_number" to double does
00037                    not return the error value: extraction of the
00038                    leading "1.2" succeeds and the rest of the string
00039                    is silently ignored.
00040                 */
00041                 template <typename value_type>
00042                 value_type from_string( const std::string & str, const value_type & errorVal ) throw()
00043                 {
00044                         std::istringstream source( str );
00045                         value_type parsed = value_type();
00046                         // Hand back the parsed value only if the stream was usable and the extraction worked.
00047                         if ( source && ( source >> parsed ) )
00048                                 return parsed;
00049                         // Anything else falls back to the caller-supplied default.
00050                         return errorVal;
00051                 }
00052 
00053                 /**
00054                    Returns the text produced by streaming obj through
00055                    operator<< into a std::ostringstream; obj must be ostreamable.
00056                 */
00057                 template <typename value_type>
00058                 std::string to_string( const value_type & obj ) throw()
00059                 {
00060                         std::ostringstream sink;
00061                         // std::fixed is not applied: the stream's default formatting is used.
00062                         sink << obj;
00063                         return sink.str();
00064                 }
00065 
00066 //                 inline std::string to_string( double d ) throw()
00067 //                 {
00068 //                         std::ostringstream os;
00069 //                         os << std::fixed << d;
00070 //                         return os.str();
00071 //                 }
00072 
00073                 /**
00074                    Overload for string targets: returns a copy of str unchanged; the second argument is ignored.
00075                 */
00076                 inline std::string from_string( const std::string & str, const std::string & /*errorVal*/ ) throw()
00077                 {
00078                         return str;
00079                 }
00080 
00081                 /**
00082                    Overload for C strings. A null str yields errorVal, or an empty string if errorVal is null as well.
00083                 */
00084                 inline std::string from_string( const char *str, const char * errorVal ) throw()
00085                 {
00086                         return str ? str : ( errorVal ? errorVal : "" ); // never build a std::string from a null pointer
00087                 }
00088 
00089                 /**
00090                    Overload for C strings; a null pointer is rendered as the empty string.
00091                 */
00092                 inline std::string to_string( const char *obj ) throw()
00093                 {
00094                         return ( 0 == obj ) ? std::string() : std::string( obj );
00095                 }
00096 
00097                 /**
00098                    Overload for std::string: returns a copy of obj without going through a stream.
00099                 */
00100                 inline std::string to_string( const std::string & obj ) throw()
00101                 {
00102                         return obj;
00103                 }
00104 
00105 
00106         } // end STPrivate namespace
00107 
00108         /**
00109            String-to-string map type, for use with translate_entities().
00110          */
00111         typedef std::map<std::string,std::string> entity_map;
00112 
00113         /**
00114            For each entry in the input string, the characters are
00115            mapped to string sequences using the given
00116            translation_map. Where no mappings exist, the input
00117            sequence is left as-is. buffer is modified in place.
00118 
00119            Returns the number of translations made.
00120 
00121            If reverse_translation == true then a reverse mapping is
00122            done: map values are treated as keys.
00123 
00124            This is useful, for example, for doing XML-entity-to-char
00125            conversions.
00126 
00127        Complexity is essentially linear, based on a combination of
00128        buffer.size() and translation_map.size(). Best used with
00129        small maps on short strings! The speed could be increased
00130        significantly, but probably only if keys and values were
00131        restricted to 1 character each.
00132 
00133        Design note: this really should be a function template,
00134        accepting any lexically-castable key/val types, but the
00135        function is quite long, and therefore not really suitable
00136        for inclusion in the header.
00137         */
00138         std::size_t translate_entities( std::string & buffer, const entity_map & translation_map, bool reverse_translation = false );
00139 
00140 
00141         /**
00142            Bit flags telling trim_string() which end(s) of a string to trim.
00143         */
00144         enum TrimPolicy {
00145         /**
00146            Trim only leading spaces (bit 0x01).
00147          */
00148         TrimLeading = 0x01,
00149         /**
00150            Trim only trailing spaces (bit 0x02).
00151          */
00152         TrimTrailing = 0x02,
00153         /**
00154            Trim leading and trailing spaces (both bits set).
00155          */
00156         TrimAll = TrimLeading | TrimTrailing
00157         };
00158 
00159         /**
00160            Trims whitespace from the given string in place, per the TrimPolicy (default: both ends),
00161            and returns the number of whitespace characters removed.
00162          */
00163         std::size_t trim_string( std::string &, TrimPolicy = TrimAll );
00164         /**
00165            Same trimming as above, but the input string is left untouched
00166            and the trimmed string is returned instead.
00167          */
00168         std::string trim_string( const std::string &, TrimPolicy = TrimAll );
00169 
00170 
00171         /**
00172            Attempts to remove all backslash-escaped chars from str,
00173            modifying it in place.
00174            Removes backslash-escaped newlines from the input string, including
00175            any whitespace immediately following each backslash.
00176            The optional slash parameter defines the escape character.
00177            NOTE(review): the returned count is undocumented; presumably the number of characters removed - confirm.
00178         */
00179         std::size_t strip_slashes( std::string &str, const char slash = '\\' );
00180 
00181         /**
00182            Inserts escape_seq in front of every character of
00183            instring which also appears in chars_to_escape; instring
00184            is modified in place. Returns the number of escapes added.
00185 
00186            e.g., to put a single backslash in front of every $, % and \
00187            in mystring:
00188 
00189            <pre>
00190            escape_string( mystring, "$%\\", "\\" );
00191            </pre>
00192 
00193            (WARNING: the doxygen-generated HTML version of these docs
00194            may incorrectly show single backslashes in the above example!)
00195 
00196            escape_seq defaults to a single backslash.
00197         */
00198         std::size_t escape_string( std::string & instring, const std::string & chars_to_escape, const std::string & escape_seq = "\\" );
00199 
00200         /**
00201            normalize_string() is like trim_string() and
00202            strip_slashes() combined, plus it removes leading/trailing
00203            quotes. The string is modified in place. For example:
00204 
00205            <pre>
00206            "this is a \
00207            sample multi-line, backslash-escaped \
00208            string."
00209            </pre>
00210            
00211            will be translated to:
00212            <pre>
00213            this is a sample multi-line, backslash-escaped string.
00214            </pre>
00215         */
00216         void normalize_string( std::string & );
00217 
00218 
00219         /**
00220            Returns the first whitespace-delimited token of the given
00221            string, or an empty string if it contains no such token.
00222         */
00223         std::string first_token( const std::string & );
00224 
00225         /**
00226            Returns the passed-in string with its first
00227            whitespace-delimited token removed. An empty string is
00228            returned if there is no second token.
00229          */
00230         std::string after_first_token( const std::string & );
00231 
00232 
00233 
00234         /**
00235            Returns the int value of a hex digit ('0'-'9', 'a'-'f' or
00236            'A'-'F'), or -1 for any other character.
00237         */
00238         int int4hexchar( char character );
00239 
00240         /**
00241            Returns the integer value of wd, which is assumed to be a
00242            hex-encoded number. wd may optionally be prefixed with '#',
00243            as in \#ff00ff. Case is insignificant.
00244 
00245            On error -1 is returned, but -1 is also potentially a valid
00246            result, so callers cannot reliably tell a failure from a
00247            successful conversion. :/
00248         */
00249         int hex2int( const std::string & wd );
00250 
00251 
00252         /**
00253            Lexically casts v to a string by forwarding to STPrivate::to_string().
00254         */
00255         template <typename ValueT>
00256         std::string to( const ValueT & v )
00257         {
00258         return STPrivate::to_string(v);
00259         }
00260 
00261         /**
00262            Lexically casts v to a ValueT by forwarding to STPrivate::from_string();
00263            dflt (a value-initialized ValueT unless given) is returned if conversion fails.
00264         */
00265         template <typename ValueT>
00266         ValueT from( const std::string & v, const ValueT & dflt = ValueT() )
00267         {
00268                 return STPrivate::from_string( v, dflt );
00269         }
00270 
00271 
00272         /**
00273            Repeats the entity_map typedef declared earlier in this header. See translate_entities() for details.
00274         */
00275         typedef std::map<std::string,std::string> entity_map;
00276 
00277 
00278     /**
00279        YAGNI!
00280 
00281        Function object which applies translate_entities() to each
00282        string it is called with, e.g. via std::for_each().
00283     */
00284         struct entity_translator
00285         {
00286         /**
00287            Remembers the translation map and the reverse flag for later calls
00288            to operator(). Only the address of map is stored, so map must outlive this object.
00289         */
00290                 entity_translator( const entity_map & map, bool reverse )
00291                         : m_map( &map ), m_rev( reverse )
00292                 {
00293                 }
00294 
00295         /**
00296            Translates str in place by calling
00297            translate_entities( str, MAP, REVERSE ), where MAP and
00298            REVERSE are the values given to the ctor.
00299         */
00300                 void operator()( std::string & str ) const
00301                 {
00302                         translate_entities( str, *m_map, m_rev );
00303                 }
00304         private:
00305                 const entity_map * m_map;
00306                 bool m_rev;
00307 
00308         };
00309 
00310         /**
00311            Internal-use initializer functor for setting up an entity
00312            translation map for default quote-escaping behaviour.
00313         */
00314         struct default_escapes_initializer
00315         {
00316         /**
00317            Adds the following escape sequences to map:
00318 
00319            - 1x backslash (\) == 2x backslash.
00320 
00321            - 1x apostrophe  == 1x backslash 1x apostrophe
00322 
00323            - 1x double-quote  == 1x backslash 1x double-quote  
00324         */
00325                 void operator()( entity_map & map );
00326         };
00327 
00328 
00329         /** Internal marker type: an empty class template, parameterized on ContextT. */
00330         template <typename ContextT> struct strtool_sharing_context {};
00331 
00332         /**
00333            Returns the default entity translation map, which can be used to
00334            [un]slash-escape the following entities: '\\', '\'', '"'.
00335         */
00336         const entity_map & default_escapes_translations();
00337 
00338         /**
00339            Lexically casts v to a string via to(), runs translate_entities( ..., trans, reverse )
00340            over that string, and returns the result.
00341         */
00342         template <typename ValueT>
00343         std::string translate( const ValueT & v,
00344                                const entity_map & trans,
00345                                bool reverse )
00346         {
00347                 std::string text( to( v ) );
00348                 translate_entities( text, trans, reverse );
00349                 return text;
00350         }
00351 
00352 
00353         /**
00354            Calls translate( v, trans, false ); trans defaults to default_escapes_translations().
00355         */
00356         template <typename ValueT>
00357         std::string escape( const ValueT & v, const entity_map & trans = default_escapes_translations() )
00358         {
00359                 return translate( v, trans, false );
00360         }
00361 
00362 
00363         /**
00364            Calls translate( v, trans, true ); trans defaults to default_escapes_translations().
00365         */
00366         template <typename ValueT>
00367         std::string unescape( const ValueT & v, const entity_map & trans = default_escapes_translations() )
00368         {
00369                 return translate( v, trans, true );
00370         }
00371 
00372         /**
00373            Returns the string form of v (see to()) wrapped in the given
00374            quote string on both sides. The default quote is a single apostrophe.
00375         */
00376         template <typename ValueT>
00377         std::string quote( const ValueT & v, const std::string & quote = "\'" )
00378         {
00379                 return std::string( quote ).append( to( v ) ).append( quote );
00380         }
00381 
00382         /**
00383            Exactly like expand_dollar_refs_inline() but leaves text untouched and
00384            returns a new string holding the result of the expansions. The returned
00385            string may be the same as the original.
00386  
00387          */
00388         std::string expand_dollar_refs( const std::string & text, const entity_map & src );
00389 
00390         /**
00391            Parses env vars out of buffer, replacing them with their
00392            values, as defined in the src map. Accepts variables
00393            in the format ${VAR} and $VAR.
00394 
00395            e.g., ${foo} corresponds to the value set in src["foo"].
00396 
00397            Referencing a variable which is not set does not
00398            expand the variable to an empty value: it is left
00399            as-is. Thus expanding ${FOO} when "FOO" is not set
00400            will result in "${FOO}".
00401 
00402            To get a dollar sign into the resulting string, escape
00403            it with a single backslash: this keeps it from being
00404            parsed as a ${variable}.
00405 
00406        Returns the number of variables expanded.
00407 
00408        Note that this function is much *more* efficient than using
00409        translate_entities() to perform a similar operation.
00410        Because of its stricter format we can do a single pass
00411        through the string and may not even have to reference the
00412        source map.
00413 
00414        Complexity depends on the number of ${vars} which are expanded
00415        in buffer: overall runtime depends on buffer length,
00416        plus a non-determinate amount of time per ${var} expanded.
00417 
00418        Design note: this really should be a function template,
00419        accepting any lexically-castable key/val types, but the
00420        function is quite long, and therefore not really suitable
00421        for inclusion in the header.
00422 
00423 
00424        Known misgivings:
00425 
00426        - When buffer contains dollar signs which are preceded by
00427        a slash, the slash is stripped even if the $ does not
00428        expand to anything. This is arguably undesirable behaviour.
00429         */
00430         std::size_t expand_dollar_refs_inline( std::string & buffer, const entity_map & src );
00431 
00432 
00433         /**
00434            string_tokenizer is a... well, a string tokenizer, modelled after
00435            Java's java.util.StringTokenizer class.
00436 
00437            This code used to be part of the KDE 1.x libraries: (named StringTokenizer)
00438 
00439            Copyright (C) 1997 Martin Jones (mjones@kde.org),
00440            (C) 1997 Torben Weis (weis@kde.org), and
00441            (C) 1998 Waldo Bastian (bastian@kde.org)
00442            
00443            Then this code was part of the QUB project:
00444 
00445            Copyright (C) 2000-2003 stephan beal (sgbeal@users.sourceforge.net)
00446            and Rusty Ballinger (bozo@users.sourceforge.net)
00447            
00448            THIS code is part of the s11n project, and is maintained by
00449            stephan@s11n.net. i have been graciously granted explicit
00450            permission from the three original authors to release this
00451            code into the Public Domain, and this copy falls under that
00452            "license." (The original license was GNU GPL.)
00453         */
00454 
00455         class string_tokenizer
00456         {
00457         public:
00458                 string_tokenizer();
00459                 ~string_tokenizer();
00460 
00461                 /**
00462                    Sets the token list and separator to be used by
00463                    subsequent next_token() calls.
00464 
00465                    It is important that the strings not be
00466                    destroyed/freed by the client before this object is
00467                    done with them. That is, do not call tokenize(),
00468                    then free the strings, then call has_tokens() or
00469                    next_token(). (In practice, this has never happened.)
00470                  */
00471                 void tokenize( const char * sequence, const char * separator );
00472 
00473                 /**
00474                    Returns the next token in the list. Results are
00475                    undefined if this method is called when
00476                    has_tokens() returns false.
00477                 */
00478                 const char* next_token();
00479 
00480                 /**
00481                    Returns true if this object has another token to
00482                    return via next_token().
00483                  */
00484                 bool has_tokens();
00485 
00486         private:
00487                 char *pos;
00488                 char *end;
00489                 char *buffer;
00490                 int  bufLen;
00491         };
00492 
00493 
00494 
00495     /**
00496            stdstring_tokenizer:
00497 
00498            License: Public Domain
00499 
00500            Author: stephan@s11n.net
00501            
00502            Based heavily off of work by:
00503            
00504            Martin Jones (mjones@kde.org), Torben Weis (weis@kde.org)
00505            and Waldo Bastian (bastian@kde.org)
00506 
00507            which i originally found as string_tokenizer in the KDE 1.x
00508            source tree. i have received explicit permission from each
00509            of those gentlemen to release the string_tokenizer code
00510            into the Public Domain. (Many thanks to them for that
00511            permission!)
00512 
00513        This class is meant to be API- and behaviour-compatible
00514        with string_tokenizer. This implementation is, however,
00515        MUCH less efficient, and works on std::strings instead of
00516        C-style strings (const char *).
00517            
00518            stdstring_tokenizer tokenizes strings in a way which is
00519            consistent with the way a Unix shell does. This makes it
00520            appropriate for use in parsing many types of arbitrary user
00521            input, from command-line arguments to comma-separated
00522            files.
00523         */
00524     class stdstring_tokenizer
00525     {
00526           public:
00527         stdstring_tokenizer();
00528                 /**
00529                    Same as creating a stdstring_tokenizer and calling its tokenize( str, separators ).
00530                  */
00531         stdstring_tokenizer( const std::string & str, const std::string & separators );
00532         ~stdstring_tokenizer();
00533 
00534         /**
00535                    str is split up at points matching any element in
00536                    separators. Adjacent separators in str are
00537                    interpreted as empty elements. Thus the string
00538                    "1;;3", separated by ";", has 3 tokens:
00539                    ("1","","3").
00540 
00541                    To collect the tokens, do this:
00542 
00543 <pre>
00544 stdstring_tokenizer tok( "some string", " " );
00545 while( tok.has_tokens() ) cout << "Token: " << tok.next_token() << endl;
00546 </pre>
00547                  */
00548         void tokenize( const std::string & str, const std::string & separators );
00549         /**
00550                    Returns the next token in our list.
00551                    Calling next_token() when has_tokens() returns
00552                    false has undefined behaviour.
00553                  */
00554         std::string next_token();
00555         /**
00556                    Returns true if this object has more tokens to give you.
00557                 */
00558         bool has_tokens() const;
00559 
00560           private:
00561         typedef std::queue < std::string > queue_type;
00562         queue_type m_list;
00563     };
00564 
00565 
00566 } } } // namespaces
00567 
00568 
00569 #endif // s11n_net_s11n_STRINGTOOL_HPP_INCLUDED

Generated on Sun Apr 27 13:16:04 2008 for libs11n by  doxygen 1.5.3