Boost.Locale
generic_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0.
5 // https://www.boost.org/LICENSE_1_0.txt
6 
7 #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
8 #define BOOST_LOCALE_GENERIC_CODECVT_HPP
9 
10 #include <boost/locale/utf.hpp>
11 #include <boost/cstdint.hpp>
12 #include <locale>
13 
14 namespace boost { namespace locale {
15 
16 #ifndef BOOST_LOCALE_DOXYGEN
17  //
18  // Make sure that std::mbstate_t can hold 16 bits of a UTF-16 sequence
19  //
20  static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is to small");
21 #endif
22 
23 #if defined(_MSC_VER) && _MSC_VER < 1700
24 // Up to MSVC 11 (2012), do_length is non-standard: it counts wide characters instead of narrow ones and does not
25 // change the mbstate
26 # define BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
27 #endif
28 
31  public:
36  };
37  };
38 
142  template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
144 
151  template<typename CharType, typename CodecvtImpl>
152  class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
153  public generic_codecvt_base {
154  public:
155  typedef CharType uchar;
156 
// Constructs the facet, forwarding `refs` unchanged to the std::codecvt base-class constructor.
157  generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
// Returns this object viewed as the derived CodecvtImpl (static downcast of `this`).
// The conversion functions below call initial_state(), to_unicode(), from_unicode()
// and max_encoding_length() through this reference.
158  const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
159 
160  protected:
// Terminates a conversion sequence without writing any output.
//
// The leading 16 bits of `s` are read as the conversion state used by do_in/do_out.
// Returns error when that value is non-zero (`next` is left unassigned in that case);
// otherwise sets `next` to `from` and returns ok. The `to` argument is never used.
161  std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
162  {
163  boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&s);
// A non-zero state means a conversion was left unfinished, so there is nothing valid to emit
164  if(state != 0)
165  return std::codecvt_base::error;
166  next = from;
167  return std::codecvt_base::ok;
168  }
// Always returns 0.
169  int do_encoding() const noexcept override { return 0; }
// Forwards to the derived implementation's max_encoding_length().
170  int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
// Always returns false.
171  bool do_always_noconv() const noexcept override { return false; }
172 
// Scans [from, from_end) while at most `max` output units remain to be counted.
//
// Each decoded code point decrements `max` once; a code point above 0xFFFF is decoded
// twice (the input pointer is rewound after the first pass), so it decrements `max` twice.
//
// Return value:
//  - without BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST: the number of input bytes consumed
//    (from - save_from); the 16-bit flag stored in `std_state` is updated in place.
//  - with BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST: the number of units counted
//    (start_max - max); `std_state` is only copied, never modified.
173  int do_length(
174 #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
175  const
176 #endif
177  std::mbstate_t& std_state,
178  const char* from,
179  const char* from_end,
180  size_t max) const override
181  {
182 #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
183  const char* save_from = from;
// Reference: changes to the flag are written back into the caller's mbstate_t
184  boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
185 #else
186  const size_t start_max = max;
// Local copy: the caller's (const) mbstate_t is left untouched
187  boost::uint16_t state = *reinterpret_cast<const boost::uint16_t*>(&std_state);
188 #endif
189 
190  typename CodecvtImpl::state_type cvt_state =
191  implementation().initial_state(generic_codecvt_base::to_unicode_state);
192  while(max > 0 && from < from_end) {
193  const char* prev_from = from;
194  boost::uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
// NOTE(review): listing line 195 is missing here. The `if(...) {` that opens the block
// closed at listing line 198 is not visible; presumably it tests `ch` for a failed or
// incomplete decode - confirm against the original header.
// In that branch the input pointer is restored and counting stops.
196  from = prev_from;
197  break;
198  }
199  max--;
200  if(ch > 0xFFFF) {
201  if(state == 0) {
// First pass over this code point: rewind so it is decoded (and counted) once more
202  from = prev_from;
203  state = 1;
204  } else {
// Second pass: the input stays consumed and the flag is cleared
205  state = 0;
206  }
207  }
208  }
209 #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
210  return static_cast<int>(from - save_from);
211 #else
212  return static_cast<int>(start_max - max);
213 #endif
214  }
215 
// Converts bytes from [from, from_end) into 16-bit units written to [to, to_end).
//
// On return `from_next` and `to_next` hold the positions reached in input and output.
// Result:
//  - error   when to_unicode() yields boost::locale::utf::illegal (input pointer is restored
//            to the start of the offending sequence);
//  - partial when the loop stops early with r == partial, or when it ends with input left
//            over or with the surrogate flag still set;
//  - ok      otherwise.
216  std::codecvt_base::result do_in(std::mbstate_t& std_state,
217  const char* from,
218  const char* from_end,
219  const char*& from_next,
220  uchar* to,
221  uchar* to_end,
222  uchar*& to_next) const override
223  {
224  std::codecvt_base::result r = std::codecvt_base::ok;
225 
226  // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
227  // according to the standard. We use it to keep a 0/1 flag for surrogate pair writing:
228  //
229  // if 0, no code point above 0xFFFF is pending; if 1, a code point above 0xFFFF was observed
230  // and its first surrogate is written, but no input consumed
231  boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
232  typename CodecvtImpl::state_type cvt_state =
233  implementation().initial_state(generic_codecvt_base::to_unicode_state);
234  while(to < to_end && from < from_end) {
235  const char* from_saved = from;
236 
237  uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
238 
239  if(ch == boost::locale::utf::illegal) {
240  from = from_saved;
241  r = std::codecvt_base::error;
242  break;
243  }
// NOTE(review): listing line 244 is missing here. The `if(...) {` that opens the block
// closed at listing line 248 is not visible; since the branch reports `partial`, it
// presumably tests `ch` against boost::locale::utf::incomplete - confirm against the
// original header.
245  from = from_saved;
246  r = std::codecvt_base::partial;
247  break;
248  }
249  // Normal code points go directly to the output
250  if(ch <= 0xFFFF) {
251  *to++ = static_cast<uchar>(ch);
252  } else {
253  // For other code points we do the following
254  //
255  // 1. We can't consume our input as we may find ourselves
256  // in a state where all input is consumed but not all output is written, i.e. only
257  // the 1st surrogate is written
258  // 2. We only write the first surrogate and mark this in the state; we also revert
259  // the from pointer in order to make sure this code point is read
260  // once again, and then we consume our input together with writing
261  // the second surrogate
262  ch -= 0x10000;
263  boost::uint16_t w1 = static_cast<boost::uint16_t>(0xD800 | (ch >> 10));
264  boost::uint16_t w2 = static_cast<boost::uint16_t>(0xDC00 | (ch & 0x3FF));
265  if(state == 0) {
266  from = from_saved;
267  *to++ = w1;
268  state = 1;
269  } else {
270  *to++ = w2;
271  state = 0;
272  }
273  }
274  }
275  from_next = from;
276  to_next = to;
// Leftover input or a half-written surrogate pair downgrades ok to partial
277  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
278  r = std::codecvt_base::partial;
279  return r;
280  }
281 
// Converts 16-bit units from [from, from_end) into bytes written to [to, to_end).
//
// On return `from_next` and `to_next` hold the positions reached in input and output.
// Result:
//  - error   for a second (0xDC00-0xDFFF) surrogate without a stored first one, for a stored
//            first surrogate not followed by a second one, for the check at the listing gap
//            below, or when from_unicode() returns boost::locale::utf::illegal;
//  - partial when from_unicode() returns boost::locale::utf::incomplete, or when the loop ends
//            with input left over or with a first surrogate still stored in the state;
//  - ok      otherwise.
282  std::codecvt_base::result do_out(std::mbstate_t& std_state,
283  const uchar* from,
284  const uchar* from_end,
285  const uchar*& from_next,
286  char* to,
287  char* to_end,
288  char*& to_next) const override
289  {
290  std::codecvt_base::result r = std::codecvt_base::ok;
291  // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
292  // according to the standard. We assume that sizeof(mbstate_t) >= 2 in order
293  // to be able to store the first observed surrogate
294  //
295  // State: state != 0 - a first surrogate was observed (state = first surrogate),
296  // we expect the second one to come and then zero the state
297  boost::uint16_t& state = *reinterpret_cast<boost::uint16_t*>(&std_state);
298  typename CodecvtImpl::state_type cvt_state =
299  implementation().initial_state(generic_codecvt_base::from_unicode_state);
300  while(to < to_end && from < from_end) {
301  boost::uint32_t ch = 0;
302  if(state != 0) {
303  // if the state indicates that the 1st surrogate was stored
304  // we should make sure that the unit that comes next is actually
305  // a second surrogate
306  boost::uint16_t w1 = state;
307  boost::uint16_t w2 = *from;
308  // we don't advance from, as writing may still fail with an incomplete or
309  // partial conversion
310  if(0xDC00 <= w2 && w2 <= 0xDFFF) {
311  boost::uint16_t vh = w1 - 0xD800;
312  boost::uint16_t vl = w2 - 0xDC00;
313  ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
314  } else {
315  // Invalid surrogate
316  r = std::codecvt_base::error;
317  break;
318  }
319  } else {
320  ch = *from;
321  if(0xD800 <= ch && ch <= 0xDBFF) {
322  // if this is a first surrogate we put
323  // it into the state and consume it; note that we do not
324  // fall through to the conversion below (a lone surrogate would be illegal
325  // there), so we advance the from pointer manually
326  state = static_cast<uint16_t>(ch);
327  from++;
328  continue;
329  } else if(0xDC00 <= ch && ch <= 0xDFFF) {
330  // if we observe a second surrogate while
331  // only a first one may be expected, we break from the loop with an error
332  // as it is illegal input
333  r = std::codecvt_base::error;
334  break;
335  }
336  }
// NOTE(review): listing line 337 is missing here. The `if(...) {` that opens the block
// closed at listing line 340 is not visible; presumably it rejects a `ch` that is not a
// valid code point - confirm against the original header.
338  r = std::codecvt_base::error;
339  break;
340  }
341  boost::uint32_t len = implementation().from_unicode(cvt_state, ch, to, to_end);
342  if(len == boost::locale::utf::incomplete) {
343  r = std::codecvt_base::partial;
344  break;
345  } else if(len == boost::locale::utf::illegal) {
346  r = std::codecvt_base::error;
347  break;
348  } else
349  to += len;
// Only reached after a successful write: the unbraced else above covers `to += len` alone
350  state = 0;
351  from++;
352  }
353  from_next = from;
354  to_next = to;
// Leftover input or a stored first surrogate downgrades ok to partial
355  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
356  r = std::codecvt_base::partial;
357  return r;
358  }
359  };
360 
365  template<typename CharType, typename CodecvtImpl>
366  class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
367  public generic_codecvt_base {
368  public:
369  typedef CharType uchar;
370 
// Constructs the facet, forwarding `refs` unchanged to the std::codecvt base-class constructor.
371  generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
372 
// Returns this object viewed as the derived CodecvtImpl (static downcast of `this`).
// The conversion functions below call initial_state(), to_unicode(), from_unicode()
// and max_encoding_length() through this reference.
373  const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
374 
375  protected:
// Terminates a conversion sequence without writing any output.
// The state and `to` arguments are unused: `next` is set to `from` and ok is always returned.
376  std::codecvt_base::result
377  do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
378  {
379  next = from;
380  return std::codecvt_base::ok;
381  }
// Always returns 0.
382  int do_encoding() const noexcept override { return 0; }
// Forwards to the derived implementation's max_encoding_length().
383  int do_max_length() const noexcept override { return implementation().max_encoding_length(); }
// Always returns false.
384  bool do_always_noconv() const noexcept override { return false; }
385 
// Scans [from, from_end), decoding at most `max` code points; the mbstate_t argument is unused.
//
// Return value:
//  - without BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST: the number of input bytes consumed
//    (from - start_from);
//  - with BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST: the number of code points counted
//    (start_max - max).
386  int do_length(
387 #ifdef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
388  const
389 #endif
390  std::mbstate_t& /*state*/,
391  const char* from,
392  const char* from_end,
393  size_t max) const override
394  {
395 #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
396  const char* start_from = from;
397 #else
398  const size_t start_max = max;
399 #endif
400  typename CodecvtImpl::state_type cvt_state =
401  implementation().initial_state(generic_codecvt_base::to_unicode_state);
402  while(max > 0 && from < from_end) {
403  const char* save_from = from;
404  boost::uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
// NOTE(review): listing line 405 is missing here. The `if(...) {` that opens the block
// closed at listing line 408 is not visible; presumably it tests `ch` for a failed or
// incomplete decode - confirm against the original header.
// In that branch the input pointer is restored and counting stops.
406  from = save_from;
407  break;
408  }
409  max--;
410  }
411 
412 #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
413  return static_cast<int>(from - start_from);
414 #else
415  return static_cast<int>(start_max - max);
416 #endif
417  }
418 
// Converts bytes from [from, from_end) into code points written to [to, to_end),
// one output unit per decoded code point.
//
// On return `from_next` and `to_next` hold the positions reached in input and output.
// Result:
//  - error   when to_unicode() yields boost::locale::utf::illegal (input pointer is restored
//            to the start of the offending sequence);
//  - partial when the loop stops early with r == partial, or ends with input left over;
//  - ok      otherwise.
419  std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
420  const char* from,
421  const char* from_end,
422  const char*& from_next,
423  uchar* to,
424  uchar* to_end,
425  uchar*& to_next) const override
426  {
427  std::codecvt_base::result r = std::codecvt_base::ok;
428 
429  // NOTE: unlike the 2-byte (UTF-16) specialization, the mbstate_t argument is unused here:
430  // every decoded code point is stored as a single output unit, so no surrogate
431  // flag has to be carried between calls.
432  //
433  // cvt_state below is the implementation's own conversion state, obtained anew on each call.
434  auto cvt_state = implementation().initial_state(generic_codecvt_base::to_unicode_state);
435  while(to < to_end && from < from_end) {
436  const char* from_saved = from;
437 
438  uint32_t ch = implementation().to_unicode(cvt_state, from, from_end);
439 
440  if(ch == boost::locale::utf::illegal) {
441  r = std::codecvt_base::error;
442  from = from_saved;
443  break;
444  }
// NOTE(review): listing line 445 is missing here. The `if(...) {` that opens the block
// closed at listing line 449 is not visible; since the branch reports `partial`, it
// presumably tests `ch` against boost::locale::utf::incomplete - confirm against the
// original header.
446  r = std::codecvt_base::partial;
447  from = from_saved;
448  break;
449  }
450  *to++ = ch;
451  }
452  from_next = from;
453  to_next = to;
// Leftover input downgrades ok to partial
454  if(r == std::codecvt_base::ok && from != from_end)
455  r = std::codecvt_base::partial;
456  return r;
457  }
458 
// Converts code points from [from, from_end) into bytes written to [to, to_end);
// the mbstate_t argument is unused.
//
// On return `from_next` and `to_next` hold the positions reached in input and output.
// Result:
//  - error   for the check at the listing gap below, or when from_unicode() returns
//            boost::locale::utf::illegal;
//  - partial when from_unicode() returns boost::locale::utf::incomplete, or when the loop
//            ends with input left over;
//  - ok      otherwise.
459  std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
460  const uchar* from,
461  const uchar* from_end,
462  const uchar*& from_next,
463  char* to,
464  char* to_end,
465  char*& to_next) const override
466  {
467  std::codecvt_base::result r = std::codecvt_base::ok;
468  auto cvt_state = implementation().initial_state(generic_codecvt_base::from_unicode_state);
469  while(to < to_end && from < from_end) {
470  boost::uint32_t ch = 0;
471  ch = *from;
// NOTE(review): listing line 472 is missing here. The `if(...) {` that opens the block
// closed at listing line 475 is not visible; presumably it rejects a `ch` that is not a
// valid code point - confirm against the original header.
473  r = std::codecvt_base::error;
474  break;
475  }
476  boost::uint32_t len = implementation().from_unicode(cvt_state, ch, to, to_end);
477  if(len == boost::locale::utf::incomplete) {
478  r = std::codecvt_base::partial;
479  break;
480  } else if(len == boost::locale::utf::illegal) {
481  r = std::codecvt_base::error;
482  break;
483  }
// Advance both cursors only after a successful write
484  to += len;
485  from++;
486  }
487  from_next = from;
488  to_next = to;
// Leftover input downgrades ok to partial
489  if(r == std::codecvt_base::ok && from != from_end)
490  r = std::codecvt_base::partial;
491  return r;
492  }
493  };
494 
// Specialization for 1-byte character types.
//
// It overrides none of the std::codecvt virtual functions, so all conversion behavior is
// inherited from std::codecvt<CharType, char, std::mbstate_t>. It only exposes the same
// `uchar` typedef, implementation() accessor and constructor as the other specializations.
495  template<typename CharType, typename CodecvtImpl>
496  class generic_codecvt<CharType, CodecvtImpl, 1> : public std::codecvt<CharType, char, std::mbstate_t>,
497  public generic_codecvt_base {
498  public:
499  typedef CharType uchar;
500 
// Returns this object viewed as the derived CodecvtImpl (static downcast of `this`).
501  const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
502 
// Constructs the facet, forwarding `refs` to the std::codecvt base.
// The mem-initializer must name the actual base, std::codecvt<CharType, ...>: spelling it
// with a hard-coded `char` only names a base class when CharType is `char`, and fails to
// compile for any other 1-byte character type.
503  generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
504  };
505 
506 }} // namespace boost::locale
507 
508 #endif
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:27
The state would be used by to_unicode functions.
Definition: generic_codecvt.hpp:34
initial_convertion_state
Initial state for converting to or from unicode code points, used by initial_state in derived classes...
Definition: generic_codecvt.hpp:33
A base class that is used to define constants for generic_codecvt.
Definition: generic_codecvt.hpp:30
Generic codecvt facet for various stateless encodings to UTF-16 and UTF-32 using wchar_t,...
Definition: generic_codecvt.hpp:143
constexpr code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:22
The state would be used by from_unicode functions.
Definition: generic_codecvt.hpp:35
constexpr code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:24