The Battle for Wesnoth  1.13.4+dev
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
ucs4_convert_impl.hpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 - 2016 by David White <[email protected]>
3  Part of the Battle for Wesnoth Project http://www.wesnoth.org/
4 
5  This program is free software; you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation; either version 2 of the License, or
8  (at your option) any later version.
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY.
11 
12  See the COPYING file for more details.
13 */
14 
15 #ifndef SERIALIZATION_UCS4_CONVERT_IMPL_HPP_INCLUDED
16 #define SERIALIZATION_UCS4_CONVERT_IMPL_HPP_INCLUDED
17 
18 #include "unicode_types.hpp"
19 #include "util.hpp"
20 #include <cassert>
21 
23 {
24  struct utf8_impl
25  {
26  static const char* get_name() { return "utf8"; }
28  {
29  if(ch < (1u << 7))
30  return 1;
31  else if(ch < (1u << 11))
32  return 2;
33  else if(ch < (1u << 16))
34  return 3;
35  else if(ch < (1u << 21))
36  return 4;
37  else if(ch < (1u << 26))
38  return 5;
39  else if(ch < (1u << 31))
40  return 6;
41  else
42  throw utf8::invalid_utf8_exception(); // Invalid UCS-4
43  }
44 
46  {
47  if (!(ch & 0x80)) {
48  return 1; // US-ASCII character, 1 byte
49  }
50  /* first bit set: character not in US-ASCII, multiple bytes
51  * number of set bits at the beginning = bytes per character
52  * e.g. 11110xxx indicates a 4-byte character */
53  int count = count_leading_ones(ch);
54  if (count == 1 || count > 6) { // count > 4 after RFC 3629
55  throw utf8::invalid_utf8_exception(); // Stop on invalid characters
56  }
57  return count;
58  }
59 
60  /**
61  * Writes a UCS-4 character to a UTF-8 stream.
62  *
63  * @param out An object to write utf8::char_t. Required operations:
64  * 1) push(utf8::char_t) to write a single character
65  * 2) can_push(size_t n) to check whether there is still
66  * enough space for n characters.
67  * @param ch The UCS-4 character to write to the stream.
68  */
69  template<typename writer>
70  static inline void write(writer out, ucs4::char_t ch)
71  {
73  assert(out.can_push(count));
74  if(count == 1) {
75  out.push(static_cast<utf8::char_t>(ch));
76  } else {
77  for(int j = static_cast<int>(count) - 1; j >= 0; --j) {
78  unsigned char c = (ch >> (6 * j)) & 0x3f;
79  c |= 0x80;
80  if(j == static_cast<int>(count) - 1) {
81  c |= 0xff << (8 - count);
82  }
83  out.push(c);
84  }
85  }
86  }
87  /**
88  * Reads a UCS-4 character from a UTF-8 stream
89  *
90  * @param input An iterator pointing to the first character of a UTF-8
91  * sequence to read.
92  * @param end An iterator pointing to the end of the UTF-8 sequence
93  * to read.
94  */
95  template<typename iitor_t>
96  static inline ucs4::char_t read(iitor_t& input, const iitor_t& end)
97  {
98  assert(input != end);
99  size_t size = byte_size_from_utf8_first(*input);
100 
101  ucs4::char_t current_char = static_cast<unsigned char>(*input);
102 
103  // Convert the first character
104  if(size != 1) {
105  current_char &= 0xFF >> (size + 1);
106  }
107 
108  // Convert the continuation bytes
109  // i == number of '++input'
110  ++input;
111  for(size_t i = 1; i < size; ++i, ++input) {
112  // If the string ends occurs within an UTF8-sequence, this is bad.
113  if (input == end)
115 
116  if ((*input & 0xC0) != 0x80)
118 
119  current_char = (current_char << 6) | (static_cast<unsigned char>(*input) & 0x3F);
120  }
121  //i == size => input was increased size times.
122 
123  // Check for non-shortest-form encoding
124  // This has been forbidden in Unicode 3.1 for security reasons
125  if (size > byte_size_from_ucs4_codepoint(current_char))
127  return current_char;
128  }
129  };
130 
131  struct utf16_impl
132  {
133  static const char* get_name() { return "utf16"; }
134  template<typename writer>
135  static inline void write(writer out, ucs4::char_t ch)
136  {
137  const ucs4::char_t bit17 = 0x10000;
138 
139  if(ch < bit17)
140  {
141  assert(out.can_push(1));
142  out.push(static_cast<utf16::char_t>(ch));
143  }
144  else
145  {
146  assert(out.can_push(2));
147  const ucs4::char_t char20 = ch - bit17;
148  assert(char20 < (1 << 20));
149  const ucs4::char_t lead = 0xD800 + (char20 >> 10);
150  const ucs4::char_t trail = 0xDC00 + (char20 & 0x3FF);
151  assert(lead < bit17);
152  assert(trail < bit17);
153  out.push(static_cast<utf16::char_t>(lead));
154  out.push(static_cast<utf16::char_t>(trail));
155  }
156  }
157 
158  template<typename iitor_t>
159  static inline ucs4::char_t read(iitor_t& input, const iitor_t& end)
160  {
161  const ucs4::char_t last10 = 0x3FF;
162  const ucs4::char_t type_filter = 0xFC00;
163  const ucs4::char_t type_lead = 0xD800;
164  const ucs4::char_t type_trail = 0xDC00;
165 
166  assert(input != end);
167  ucs4::char_t current_char = static_cast<utf16::char_t>(*input);
168  ++input;
169  ucs4::char_t type = current_char & type_filter;
170  if(type == type_trail)
171  {
172  //found trail without head
174  }
175  else if(type == type_lead)
176  {
177  if(input == end)
178  {
179  //If the string ends occurs within an UTF16-sequence, this is bad.
181  }
182  if((*input & type_filter) != type_trail)
183  {
185  }
186  current_char &= last10;
187  current_char <<= 10;
188  current_char += (*input & last10);
189  current_char += 0x10000;
190  ++input;
191  }
192  return current_char;
193  }
194  };
195 
196  struct utf32_impl
197  {
198  static const char* get_name() { return "UCS4"; }
199  template<typename writer>
200  static inline void write(writer out, ucs4::char_t ch)
201  {
202  assert(out.can_push(1));
203  out.push(ch);
204  }
205 
206  template<typename iitor_t>
207  static inline ucs4::char_t read(iitor_t& input, const iitor_t& end)
208  {
209  assert(input != end);
210  ucs4::char_t current_char = *input;
211  ++input;
212  return current_char;
213  }
214  };
215 
/**
 * Maps a character type to its codec through a nested @c type typedef.
 *
 * The primary template is intentionally empty; only the explicit
 * specialisations provide @c type.
 */
template<typename T_CHAR>
struct convert_impl {};
218 
219  template<>
221  {
222  typedef utf8_impl type;
223  };
224 
225  template<>
227  {
228  typedef utf16_impl type;
229  };
230 
231  template<>
233  {
234  typedef utf32_impl type;
235  };
236 }
237 
238 #endif
static ucs4::char_t read(iitor_t &input, const iitor_t &end)
static size_t byte_size_from_ucs4_codepoint(ucs4::char_t ch)
unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit...
Definition: util.hpp:415
char char_t
static void write(writer out, ucs4::char_t ch)
const GLfloat * c
Definition: glew.h:12741
GLuint GLuint GLsizei GLenum type
Definition: glew.h:1221
GLenum GLenum GLenum input
Definition: glew.h:10668
For Win32 API.
Definition: unicode.hpp:33
GLuint GLuint end
Definition: glew.h:1221
static ucs4::char_t read(iitor_t &input, const iitor_t &end)
static void write(writer out, ucs4::char_t ch)
Templates and utility-routines for strings and numbers.
GLuint GLuint GLsizei count
Definition: glew.h:1221
static const char * get_name()
static int byte_size_from_utf8_first(utf8::char_t ch)
Functions for converting Unicode wide-char strings to UTF-8 encoded strings, back and forth...
Definition: unicode.cpp:36
Thrown by operations encountering invalid UTF-8 data.
wchar_t char_t
static ucs4::char_t read(iitor_t &input, const iitor_t &end)
Reads a UCS-4 character from a UTF-8 stream.
boost::uint32_t char_t
size_t i
Definition: function.cpp:1057
static int writer(lua_State *L, const void *b, size_t size, void *B)
Definition: lstrlib.cpp:166
static void write(writer out, ucs4::char_t ch)
Writes a UCS-4 character to a UTF-8 stream.
GLsizeiptr size
Definition: glew.h:1649