OpenMW
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
utf8stream.hpp
Go to the documentation of this file.
1 #ifndef MISC_UTF8ITER_HPP
2 #define MISC_UTF8ITER_HPP
3 
#include <cstdint>
#include <utility>

#include <boost/tuple/tuple.hpp>
5 
/// Forward-only decoder over a UTF-8 encoded byte range.
///
/// The stream stores pointers into the caller's buffer and never copies or
/// frees it, so the buffer must stay valid for as long as the stream is used.
class Utf8Stream
{
public:

    typedef uint32_t UnicodeChar;
    typedef unsigned char const * Point;

    /// Sentinel returned whenever no valid character could be decoded.
    /// Kept as a function rather than a static constant: the original note
    /// here says gcc could not handle
    /// `static const unicode_char sBadChar = 0xFFFFFFFF;`.
    static UnicodeChar sBadChar () { return UnicodeChar (0xFFFFFFFF); }

    /// Creates a stream over the half-open byte range [begin, end).
    Utf8Stream (Point begin, Point end) :
        cur (begin), nxt (begin), end (end), val (Utf8Stream::sBadChar())
    {
    }

    /// Creates a stream over the half-open byte range [range.first, range.second).
    Utf8Stream (std::pair <Point, Point> range) :
        cur (range.first), nxt (range.first), end (range.second), val (Utf8Stream::sBadChar())
    {
    }

    /// True once the read position has reached the end of the range.
    bool eof () const
    {
        return cur == end;
    }

    /// Pointer to the first byte of the character that peek()/consume()
    /// would return next.
    Point current () const
    {
        return cur;
    }

    /// Returns the character at the read position without advancing.
    /// The character is decoded on the first call and cached afterwards.
    /// Returns sBadChar() for malformed input or when the stream is at eof.
    UnicodeChar peek ()
    {
        if (cur == nxt)
            next ();
        return val;
    }

    /// Returns the character at the read position and moves past it.
    /// Returns sBadChar() for malformed input or when the stream is at eof;
    /// at eof the read position does not move.
    UnicodeChar consume ()
    {
        if (cur == nxt)
            next ();
        cur = nxt;
        return val;
    }

    /// Decodes one character from the byte range [cur, end).
    ///
    /// Returns the decoded character together with the position at which
    /// decoding should resume. On failure the character is sBadChar() and
    /// the position is:
    ///  - cur itself when the range is empty,
    ///  - just after the lead byte for an invalid lead byte or a sequence
    ///    cut short by end,
    ///  - the offending byte when a continuation byte is malformed.
    ///
    /// Lead bytes announcing up to five continuation bytes are accepted.
    /// Overlong forms and surrogate values are not rejected.
    static std::pair <UnicodeChar, Point> decode (Point cur, Point end)
    {
        // Never read from an empty range; this is what peek()/consume()
        // hit when called at eof.
        if (cur == end)
            return std::make_pair (sBadChar(), cur);

        // Single byte (ASCII) fast path.
        if ((*cur & 0x80) == 0)
        {
            UnicodeChar chr = *cur++;

            return std::make_pair (chr, cur);
        }

        // Lead byte: number of continuation bytes plus its payload bits.
        const std::pair <int, UnicodeChar> lead = octet_count (*cur++);
        const int octets = lead.first;
        UnicodeChar chr = lead.second;

        // octet_count reports 6 when the byte is not a valid lead byte.
        if (octets > 5)
            return std::make_pair (sBadChar(), cur);

        // Compare distances instead of forming cur + octets first, which
        // could point beyond one-past-the-end of the buffer.
        if (end - cur < octets)
            return std::make_pair (sBadChar(), cur);

        const Point eoc = cur + octets;

        while (cur != eoc)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair (sBadChar(), cur);

            chr = (chr << 6) | UnicodeChar ((*cur++) & 0x3F);
        }

        return std::make_pair (chr, cur);
    }

private:

    /// Classifies a lead byte.
    ///
    /// Returns the number of continuation bytes it announces (1..5) together
    /// with the payload bits carried by the lead byte itself. A count of 6
    /// means the byte matched none of the lead-byte patterns.
    static std::pair <int, UnicodeChar> octet_count (unsigned char octet)
    {
        int octets;

        unsigned char mark = 0xC0; // 110xxxxx: one continuation byte
        unsigned char mask = 0xE0;

        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;

            // Extend the pattern by one more leading 1 bit.
            mark = static_cast <unsigned char> ((mark >> 1) | 0x80);
            mask = static_cast <unsigned char> ((mask >> 1) | 0x80);
        }

        return std::make_pair (octets, UnicodeChar (octet & ~mask));
    }

    /// Decodes the character starting at nxt, caching it in val and
    /// advancing nxt to the resume position.
    void next ()
    {
        const std::pair <UnicodeChar, Point> decoded = decode (nxt, end);
        val = decoded.first;
        nxt = decoded.second;
    }

    Point cur;       ///< start of the character at the read position
    Point nxt;       ///< resume position after the cached character (== cur when nothing is cached)
    Point end;       ///< one past the last byte of the range
    UnicodeChar val; ///< most recently decoded character
};
115 
116 #endif
unsigned char const * Point
Definition: utf8stream.hpp:11
Utf8Stream(Point begin, Point end)
Definition: utf8stream.hpp:16
bool eof() const
Definition: utf8stream.hpp:26
UnicodeChar consume()
Definition: utf8stream.hpp:43
Point end
Definition: utf8stream.hpp:112
static UnicodeChar sBadChar()
Definition: utf8stream.hpp:14
Utf8Stream
Definition: utf8stream.hpp:6
Utf8Stream(std::pair< Point, Point > range)
Definition: utf8stream.hpp:21
static std::pair< int, UnicodeChar > octet_count(unsigned char octet)
Definition: utf8stream.hpp:86
uint32_t UnicodeChar
Definition: utf8stream.hpp:10
static std::pair< UnicodeChar, Point > decode(Point cur, Point end)
Definition: utf8stream.hpp:51
Point current() const
Definition: utf8stream.hpp:31
Point nxt
Definition: utf8stream.hpp:111
void next()
Definition: utf8stream.hpp:105
UnicodeChar peek()
Definition: utf8stream.hpp:36
Point cur
Definition: utf8stream.hpp:110
UnicodeChar val
Definition: utf8stream.hpp:113