// otk/ustring.hh (raw)
// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 2; -*-
#ifndef   __ustring_hh
#define   __ustring_hh

/*! @file ustring.hh
  @brief Provides a simple UTF-8 encoded string
*/

extern "C" {
#ifdef HAVE_STDINT_H
#  include <stdint.h>
#else
#  ifdef HAVE_SYS_TYPES_H
#    include <sys/types.h>
#  endif
#endif
}

#include <iterator>
#include <string>

namespace otk {

//! A single Unicode code point, stored in 32 bits.
#ifdef HAVE_STDINT_H
typedef uint32_t unichar;
#else
typedef u_int32_t unichar;
#endif

#ifndef DOXYGEN_IGNORE

//! The number of bytes to skip to find the next character in the string
/*!
  Indexed by the value of a byte taken as an unsigned char.
  Bytes 0x00-0xBF map to 1, 0xC0-0xDF to 2, 0xE0-0xEF to 3, 0xF0-0xF7 to 4,
  0xF8-0xFB to 5, 0xFC-0xFD to 6, and 0xFE-0xFF to 1.
*/
const char utf8_skip[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

//! Reads the character starting at p.
// NOTE(review): only the declaration is visible here; the decoding rules
// (and how malformed input is treated) live in the implementation file.
unichar utf8_get_char(const char *p);

#endif // DOXYGEN_IGNORE

//! The iterator type for ustring
/*!
  Note this is not a random access iterator but a bidirectional one, since all
  index operations need to iterate over the UTF-8 data.  Use std::advance() to
  move to a certain position.
  <p>
  A writeable iterator isn't provided because:  The number of bytes of the old
  UTF-8 character and the new one to write could be different. Therefore, any
  write operation would invalidate all other iterators pointing into the same
  string.
  <p>
  None of the operations perform bounds checking or validate the encoding;
  the caller must keep the iterator inside a well-formed string.
*/
template <class T>
class ustring_Iterator
{
public:
  typedef std::bidirectional_iterator_tag   iterator_category;
  typedef unichar                           value_type;
  typedef std::string::difference_type      difference_type;
  //typedef value_type                        reference;
  typedef void                              pointer;

  //! Constructs an iterator holding a value-initialized underlying position.
  inline ustring_Iterator() : _pos() {}

  //! Constructs from an iterator over a mutable std::string.
  /*!
    When T is std::string::iterator this is the copy constructor; for any
    other T it is an implicit conversion from the mutable iterator type.
  */
  inline ustring_Iterator(const ustring_Iterator<std::string::iterator>&
                          other) : _pos(other.base()) {}

  //! Returns the character at the current position, by value.
  inline value_type operator*() const {
    // get an iterator to the internal string
    std::string::const_iterator pos = _pos;
    return utf8_get_char(&(*pos));
  }

  //! Advances to the start of the next character.
  /*!
    The step width is looked up in utf8_skip using the byte at the current
    position.
  */
  inline ustring_Iterator<T> & operator++() {
    // The cast keeps bytes >= 0x80 from producing a negative table index
    // on platforms where char is signed.
    _pos += utf8_skip[static_cast<unsigned char>(*_pos)];
    return *this;
  }

  //! Advances to the next character and returns the previous position.
  inline ustring_Iterator<T> operator++(int) {
    ustring_Iterator<T> previous(*this);
    ++(*this);
    return previous;
  }

  //! Moves back to the start of the previous character.
  /*!
    Steps back one byte at a time, skipping over every byte of the form
    10xxxxxx (a UTF-8 continuation byte).
  */
  inline ustring_Iterator<T> & operator--() {
    do { --_pos; } while((*_pos & '\xC0') == '\x80');
    return *this;
  }

  //! Moves to the previous character and returns the former position.
  inline ustring_Iterator<T> operator--(int) {
    ustring_Iterator<T> previous(*this);
    --(*this);
    return previous;
  }

  //! True if both iterators wrap equal underlying positions.
  inline bool operator==(const ustring_Iterator<T>& rhs) const {
    return _pos == rhs._pos;
  }

  //! True if the iterators wrap different underlying positions.
  inline bool operator!=(const ustring_Iterator<T>& rhs) const {
    return !(_pos == rhs._pos);
  }

  //! Wraps an existing iterator of the underlying std::string.
  explicit inline ustring_Iterator(T pos) : _pos(pos) {}

  //! Returns the wrapped std::string iterator (a byte position).
  inline T base() const { return _pos; }

private:
  T _pos;
};


//! This class provides a simple wrapper to a std::string that can be encoded
//! as UTF-8. The ustring::utf8() member specifies if the given string is UTF-8
//! encoded. ustrings default to specifying UTF-8 encoding.
/*!
  This class does <b>not</b> handle extended 8-bit ASCII charsets like
  ISO-8859-1.
  <p>
  More info on Unicode and UTF-8 can be found here:
  http://www.cl.cam.ac.uk/~mgk25/unicode.html
  <p>
  This does not subclass std::string, because std::string was intended to be a
  final class.  For instance, it does not have a virtual destructor.
  <p>
  Only declarations are visible in this header; the member functions are
  defined elsewhere.
*/
class ustring {
  std::string _string;  //!< The underlying byte storage
  bool _utf8;           //!< Encoding flag exposed through utf8()/setUtf8()

public:
  typedef std::string::size_type                        size_type;
  typedef std::string::difference_type                  difference_type;

  typedef unichar                                       value_type;
  //typedef unichar &                                     reference;
  //typedef const unichar &                               const_reference;

  //typedef ustring_Iterator<std::string::iterator>       iterator;
  //typedef ustring_Iterator<std::string::const_iterator> const_iterator;

  //! Sentinel meaning "no position" / "until the end", as in std::string
  static const size_type npos = std::string::npos;

  ustring();
  ~ustring();

  // make new strings

  ustring(const ustring& other);
  ustring& operator=(const ustring& other);
  // These two constructors are not explicit, so std::string and const char*
  // values convert to ustring implicitly.
  ustring(const std::string& src);
  ustring(const char* src);

  // append to the string

  ustring& operator+=(const ustring& src);
  ustring& operator+=(const char* src);
  ustring& operator+=(char c);

  // sizes

  // NOTE(review): presumably size() counts characters while bytes() counts
  // the bytes of the underlying storage -- confirm against the implementation.
  ustring::size_type size() const;
  ustring::size_type bytes() const;
  ustring::size_type capacity() const;
  ustring::size_type max_size() const;
  bool empty() const;

  // erase substrings

  void clear();
  ustring& erase(size_type i, size_type n=npos);

  // change the string's size

  void resize(size_type n, char c='\0');

  // extract characters

  // No reference return; use replace() to write characters.
  // NOTE(review): no replace() member is declared in this class as shown.
  value_type operator[](size_type i) const;

  // internal data

  const char* data()  const;
  const char* c_str() const;

  // encoding

  bool utf8() const;
  void setUtf8(bool utf8);
};

}

#endif // __ustring_hh