#ifndef s11n_net_s11n_STRINGTOOL_HPP_INCLUDED
#define s11n_net_s11n_STRINGTOOL_HPP_INCLUDED 1

#include <cstddef> // std::size_t
#include <string>
#include <map>
#include <locale>
#include <iostream>
#include <sstream>
#include <queue>

namespace s11n { namespace io {

/**
   The strtool namespace encapsulates a set of utility functions for
   working with string objects. This mini-lib has unfortunately followed
   me from source tree to source tree like a little virus. While i have
   no special love for this code, it has proven useful time and time again.
*/
namespace strtool {

        /**
           The functions in the STPrivate namespace should not be used
           by client code.

           NOTE(review): the throw() specifications used here are
           deprecated since C++11 and removed in C++20. They are kept
           so that the header still builds as C++98; switch them to
           noexcept once pre-C++11 support is dropped.
        */
        namespace STPrivate
        {

                /**
                   Lexically casts str to a value_type, returning
                   errorVal if the conversion fails.

                   The value is extracted via operator>>() from a
                   std::istringstream, so value_type must be
                   default-constructible and istreamable.

                   TODO: implement the following suggestion from
                   Kai Unger <kai.unger@hacon.de> (21 Sept 2004):

                   When the cast is done, you should check if there
                   are unread characters left. For example, casting
                   "1.2this_definitely_is_not_a_number" to double will
                   not result in returning the error value, because
                   conversion of "1.2" to 1.2d succeeds and the rest
                   of the string is ignored.
                */
                template <typename value_type>
                value_type from_string( const std::string & str, const value_type & errorVal ) throw()
                {
                        std::istringstream is( str );
                        if ( !is )
                                return errorVal;
                        value_type foo = value_type();
                        if ( is >> foo )
                                return foo;
                        return errorVal;
                }

                /**
                   Returns a string representation of the given
                   object, which must be ostreamable.
                */
                template <typename value_type>
                std::string to_string( const value_type & obj ) throw()
                {
                        std::ostringstream os;
                        // os << std::fixed;
                        os << obj;
                        return os.str();
                }

                // inline std::string to_string( double d ) throw()
                // {
                //         std::ostringstream os;
                //         os << std::fixed << d;
                //         return os.str();
                // }

                /**
                   Convenience/efficiency overload: returns a copy of
                   str without going through a stream (so embedded
                   whitespace is preserved). The second argument is
                   ignored.
                */
                inline std::string from_string( const std::string & str, const std::string & /*errorVal*/ ) throw()
                {
                        return str;
                }

                /**
                   Convenience/efficiency overload.

                   Returns str as a std::string. If str is a null
                   pointer then errorVal is returned instead, or an
                   empty string if errorVal is also null. (Constructing
                   a std::string from a null pointer is undefined
                   behaviour, so null must never reach the string
                   constructor.)
                */
                inline std::string from_string( const char *str, const char * errorVal ) throw()
                {
                        if ( str )
                                return str;
                        return errorVal ? errorVal : "";
                }

                /**
                   Convenience/efficiency overload. A null pointer is
                   converted to an empty string.
                */
                inline std::string to_string( const char *obj ) throw()
                {
                        return obj ? obj : "";
                }

                /**
                   Convenience/efficiency overload: returns a copy of obj.
                */
                inline std::string to_string( const std::string & obj ) throw()
                {
                        return obj;
                }

        } // end STPrivate namespace

        /**
           Convenience typedef for use with translate_entities().
        */
        typedef std::map<std::string,std::string> entity_map;

        /**
           For each entry in the input string, the characters are
           mapped to string sequences using the given
           translation_map. Where no mappings exist, the input
           sequence is left as-is.

           It returns the number of translations made.

           If reverse_translation == true then a reverse mapping is
           done: map values are treated as keys.

           This is useful, for example, for doing XML-entity-to-char
           conversions.

           Complexity is essentially linear, based on a combination of
           buffer.size() and translation_map.size(). Best used with
           small maps on short strings! The speed can be increased
           significantly, but probably only if we restrict keys and
           values to 1 character each.

           Design note: this really should be a function template,
           accepting any lexically-castable key/val types, but the
           function is quite long, and therefore not really suitable
           for inclusion in the header.
        */
        std::size_t translate_entities( std::string & buffer, const entity_map & translation_map, bool reverse_translation = false );

        /**
           A policy enum used by trim_string().
        */
        enum TrimPolicy {
                /**
                   Trim only leading spaces.
                */
                TrimLeading = 0x01,
                /**
                   Trim only trailing spaces.
                */
                TrimTrailing = 0x02,
                /**
                   Trim leading and trailing spaces.
                */
                TrimAll = TrimLeading | TrimTrailing
        };

        /**
           Trims leading and trailing whitespace from the input string
           and returns the number of whitespace characters removed.
        */
        std::size_t trim_string( std::string &, TrimPolicy = TrimAll );

        /**
           Trims leading and trailing whitespace from the input string
           and returns the trimmed string.
        */
        std::string trim_string( const std::string &, TrimPolicy = TrimAll );

        /**
           Attempts to remove all backslash-escaped chars from str.

           Removes backslash-escaped newlines from the input string, including
           any whitespace immediately following each backslash.

           The optional slash parameter defines the escape character.
        */
        std::size_t strip_slashes( std::string &str, const char slash = '\\' );

        /**
           Adds an escape sequence in front of any characters in
           instring which are also in the list of chars_to_escape.
           Returns the number of escapes added.

           e.g., to escape (with a single backslash) all $, % and \ in
           mystring with a backslash:

           <pre>
           escape_string( mystring, "$%\\", "\\" );
           </pre>

           (WARNING: the doxygen-generated HTML version of these docs
           may incorrectly show single backslashes in the above example!)
        */
        std::size_t escape_string( std::string & instring, const std::string & chars_to_escape, const std::string & escape_seq = "\\" );

        /**
           normalize_string() is like trim_string() and
           strip_slashes(), combined, plus it removes leading/trailing
           quotes:

           <pre>
           "this is a \
           sample multi-line, backslash-escaped \
           string."
           </pre>

           Will translate to:
           <pre>
           this is a sample multi-line, backslash-escaped string.
           </pre>
        */
        void normalize_string( std::string & );

        /**
           Returns the first whitespace-delimited token from the given
           string, or an empty string if there is no such token.
        */
        std::string first_token( const std::string & );

        /**
           Returns the passed-in string, minus the first
           whitespace-delimited token. An empty string is returned if
           there is no second token.
        */
        std::string after_first_token( const std::string & );

        /**
           Returns int values for chars '0'-'9', 'a'-'f' and 'A'-'F',
           else -1.
        */
        int int4hexchar( char character );

        /**
           Returns decimal value of wd, which is assumed to be a
           hex-encoded number. wd may optionally be prefixed with '#',
           as in \#ff00ff. Case is insignificant.

           On error -1 is returned, but -1 is also potentially a valid
           number, so there is really no way of knowing if it fails or
           not. :/
        */
        int hex2int( const std::string & wd );

        /**
           Lexically casts v to a string.
        */
        template <typename ValueT>
        std::string to( const ValueT & v )
        {
                return STPrivate::to_string(v);
        }

        /**
           Lexically casts v to a ValueT, or returns dflt if
           conversion fails.
        */
        template <typename ValueT>
        ValueT from( const std::string & v, const ValueT & dflt = ValueT() )
        {
                return STPrivate::from_string( v, dflt );
        }

        /**
           YAGNI!

           A functor for translating entities in a set of strings.
           Designed for use with std::for_each().

           Only a pointer to the map is stored, so the map passed to
           the constructor must outlive this object.
        */
        struct entity_translator
        {
                /**
                   Sets the map and reverse options to be used from
                   calls to operator().
                */
                entity_translator( const entity_map & map, bool reverse )
                        : m_map(&map),m_rev(reverse)
                {
                }

                /**
                   Calls translate_entities( str, MAP, REVERSE ),
                   where MAP and REVERSE are the flags set via the
                   ctor.
                */
                inline void operator()( std::string & str ) const
                {
                        translate_entities( str, *(this->m_map), this->m_rev );
                }
        private:
                const entity_map * m_map; // not owned
                bool m_rev;
        };

        /**
           Internal-use initializer for setting up an entity
           translation map for default quote-escaping behaviour.
        */
        struct default_escapes_initializer
        {
                /**
                   Adds the following escape sequences to map:

                   - 1x backslash (\) == 2x backslash.

                   - 1x apostrophe == 1x backslash 1x apostrophe

                   - 1x double-quote == 1x backslash 1x double-quote
                */
                void operator()( entity_map & map );
        };

        /** Internal marker type. */
        template <typename ContextT> struct strtool_sharing_context {};

        /**
           Returns the default entity translation map, which can be used to
           [un]slash-escape the following entities: '\\', '\'', '"'.
        */
        const entity_map & default_escapes_translations();

        /**
           Converts v to a string, applies translate_entities(...,trans,reverse ),
           and returns the resulting string.
        */
        template <typename ValueT>
        std::string translate( const ValueT & v,
                               const entity_map & trans,
                               bool reverse )
        {
                std::string val = to( v );
                translate_entities( val, trans, reverse );
                return val;
        }

        /**
           Calls translate( v, trans, false );
        */
        template <typename ValueT>
        std::string escape( const ValueT & v, const entity_map & trans = default_escapes_translations() )
        {
                return translate( v, trans, false );
        }

        /**
           Calls translate( v, trans, true );
        */
        template <typename ValueT>
        std::string unescape( const ValueT & v, const entity_map & trans = default_escapes_translations() )
        {
                return translate( v, trans, true );
        }

        /**
           Returns v as a quoted string, using the given quote
           string (a single apostrophe by default) on both sides.
        */
        template <typename ValueT>
        std::string quote( const ValueT & v, const std::string & quote = "\'" )
        {
                return quote + to( v ) + quote;
        }

        /**
           Exactly like expand_dollar_refs_inline() but returns a new string
           which results from the expansions. The returned string may
           be the same as the original.
        */
        std::string expand_dollar_refs( const std::string & text, const entity_map & src );

        /**
           Parses env vars out of buffer, replacing them with their
           values, as defined in the src map. Accepts variables
           in the format ${VAR} and $VAR.

           e.g., ${foo} corresponds to the value set in src["foo"].

           Referencing a variable which is not set does not
           expand the variable to an empty value: it is left
           as-is. Thus expanding ${FOO} when "FOO" is not set
           will result in "${FOO}".

           To get a dollar sign into the resulting string, escape
           it with a single backslash: this keeps it from being
           parsed as a ${variable}.

           Returns the number of variables expanded.

           Note that this function is much *more* efficient than using
           translate_entities() to perform a similar operation.
           Because of its stricter format we can do a single pass
           through the string and may not even have to reference the
           source map.

           Complexity depends on the number of ${vars} which are expanded
           in buffer: overall runtime depends on buffer length,
           plus a non-determinate amount of time per ${var} expanded.

           Design note: this really should be a function template,
           accepting any lexically-castable key/val types, but the
           function is quite long, and therefore not really suitable
           for inclusion in the header.

           Known misgivings:

           - When buffer contains dollar signs which are preceded by
           a slash, the slash is stripped even if the $ does not
           expand to anything. This is arguably undesirable behaviour.
        */
        std::size_t expand_dollar_refs_inline( std::string & buffer, const entity_map & src );

        /**
           string_tokenizer is a... well, a string tokenizer, modelled after
           Java's java.util.StringTokenizer class.

           This code used to be part of the KDE 1.x libraries: (named StringTokenizer)

           Copyright (C) 1997 Martin Jones (mjones@kde.org),
           (C) 1997 Torben Weis (weis@kde.org), and
           (C) 1998 Waldo Bastian (bastian@kde.org)

           Then this code was part of the QUB project:

           Copyright (C) 2000-2003 stephan beal (sgbeal@users.sourceforge.net)
           and Rusty Ballinger (bozo@users.sourceforge.net)

           THIS code is part of the s11n project, and is maintained by
           stephan@s11n.net. i have been graciously granted explicit
           permission from the three original authors to release this
           code into the Public Domain, and this copy falls under that
           "license." (The original license was GNU GPL.)

           NOTE(review): this class holds raw pointers and declares a
           destructor but no copy constructor or copy assignment
           operator. Whether copying an instance is safe depends on
           the implementation, which is not part of this header -
           confirm before copying instances.
        */
        class string_tokenizer
        {
        public:
                string_tokenizer();
                ~string_tokenizer();

                /**
                   Sets the token list and separator to be used by
                   subsequent next_token() calls.

                   It is important that the strings not be
                   destroyed/freed by the client before this object is
                   done with them. That is, do not call tokenize(),
                   then free the strings, then call has_tokens() or
                   next_token(). (In practice, this has never happened.)
                */
                void tokenize( const char * sequence, const char * separator );

                /**
                   Returns the next token in the list. Results are
                   undefined if this method is called when
                   has_tokens() returns false.
                */
                const char* next_token();

                /**
                   Returns true if this object has another token to
                   return via next_token().
                */
                bool has_tokens();

        private:
                char *pos;
                char *end;
                char *buffer;
                int bufLen;
        };

        /**
           stdstring_tokenizer:

           License: Public Domain

           Author: stephan@s11n.net

           Based heavily off of work by:

           Martin Jones (mjones@kde.org), Torben Weis (weis@kde.org)
           and Waldo Bastian (bastian@kde.org)

           which i originally found as string_tokenizer in the KDE 1.x
           source tree. i have received explicit permission from each
           of those gentlemen to release the string_tokenizer code
           into the Public Domain. (Many thanks to them for that
           permission!)

           This class is meant to be API- and behaviour-compatible
           with string_tokenizer. This implementation is, however,
           MUCH less efficient, and works on std::strings instead of
           C-style strings (const char *).

           stdstring_tokenizer tokenizes strings in a way which is
           consistent with the way a Unix shell does. This makes it
           appropriate for use in parsing many types of arbitrary user
           input, from command-line arguments to comma-separated
           files.
        */
        class stdstring_tokenizer
        {
        public:
                stdstring_tokenizer();

                /**
                   Same as creating a stdstring_tokenizer and calling its
                   tokenize( str, separators ).
                */
                stdstring_tokenizer( const std::string & str, const std::string & separators );

                ~stdstring_tokenizer();

                /**
                   str is split up at points matching any element in
                   separators. Adjacent separators in str are
                   interpreted as empty elements. Thus the string
                   "1;;3", separated by ";", has 3 tokens:
                   ("1","","3").

                   To collect the tokens, do this:

                   <pre>
                   stdstring_tokenizer tok( "some string", " " );
                   while( tok.has_tokens() ) cout << "Token: " << tok.next_token() << endl;
                   </pre>
                */
                void tokenize( const std::string & str, const std::string & separators );

                /**
                   Returns the next token in our list.
                   Calling next_token() when has_tokens() returns
                   false has undefined behaviour.
                */
                std::string next_token();

                /**
                   Returns true if this object has more tokens to give you.
                */
                bool has_tokens() const;

        private:
                typedef std::queue < std::string > queue_type;
                queue_type m_list;
        };

} } } // namespaces


#endif // s11n_net_s11n_STRINGTOOL_HPP_INCLUDED