The Battle for Wesnoth  1.13.4+dev
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
unicode.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2003 by David White <[email protected]>
3  Copyright (C) 2005 by Guillaume Melquiond <[email protected]>
4  Copyright (C) 2005 - 2016 by Philippe Plantier <[email protected]>
5  Part of the Battle for Wesnoth Project http://www.wesnoth.org/
6 
7  This program is free software; you can redistribute it and/or modify
8  it under the terms of the GNU General Public License as published by
9  the Free Software Foundation; either version 2 of the License, or
10  (at your option) any later version.
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY.
13 
14  See the COPYING file for more details.
15 */
16 
17 /**
18  * @file
19  * Unicode support functions.
20  */
21 
22 #include "global.hpp"
23 #include "ucs4_convert_impl.hpp"
24 #include "unicode_cast.hpp"
26 
27 #include "log.hpp"
28 #include "util.hpp"
29 
30 #include <cassert>
31 #include <limits>
32 
// Log domain object constructed with the name "engine".
// NOTE(review): not referenced by the code visible in this file - confirm it is still needed.
static lg::log_domain log_engine("engine");
// Error-severity stream on the general log domain; used below to report invalid UTF-8 input.
#define ERR_GENERAL LOG_STREAM(err, lg::general())
35 
36 namespace utf8 {
37 
38 static int byte_size_from_utf8_first(const unsigned char ch)
39 {
40  if (!(ch & 0x80)) {
41  return 1; // US-ASCII character, 1 byte
42  }
43  /* first bit set: character not in US-ASCII, multiple bytes
44  * number of set bits at the beginning = bytes per character
45  * e.g. 11110xxx indicates a 4-byte character */
46  int count = count_leading_ones(ch);
47  if (count == 1 || count > 6) { // count > 4 after RFC 3629
48  throw invalid_utf8_exception(); // Stop on invalid characters
49  }
50  return count;
51 }
52 
54 {
55  if(!s.empty()) {
58 
59  for(;itor != utf8::iterator::end(s); ++itor) {
61  // If wchar_t is less than 32 bits wide, we cannot apply towlower() to all codepoints
62  if(uchar <= static_cast<ucs4::char_t>(std::numeric_limits<wchar_t>::max()))
63  uchar = towlower(static_cast<wchar_t>(uchar));
65  }
66 
67  res.append(itor.substr().second, s.end());
68  return res;
69  }
70  return s;
71 }
72 
73 size_t index(const utf8::string& str, const size_t index)
74 {
75  // chr counts characters, i is the codepoint index
76  // remark: several functions rely on the fallback to str.length()
77  unsigned int i = 0, len = str.size();
78  try {
79  for (unsigned int chr=0; chr<index && i<len; ++chr) {
80  i += byte_size_from_utf8_first(str[i]);
81  }
82  } catch(invalid_utf8_exception&) {
83  ERR_GENERAL << "Invalid UTF-8 string." << std::endl;
84  }
85  return i;
86 }
87 
88 size_t size(const utf8::string& str)
89 {
90  unsigned int chr, i = 0, len = str.size();
91  try {
92  for (chr=0; i<len; ++chr) {
93  i += byte_size_from_utf8_first(str[i]);
94  }
95  } catch(invalid_utf8_exception&) {
96  ERR_GENERAL << "Invalid UTF-8 string." << std::endl;
97  }
98  return chr;
99 }
100 
102 {
103  return str.insert(index(str, pos), insert);
104 }
105 
106 utf8::string& erase(utf8::string& str, const size_t start, const size_t len)
107 {
108  if (start > size(str)) return str;
109  unsigned pos = index(str, start);
110 
111  if (len == std::string::npos) {
112  // without second argument, std::string::erase truncates
113  return str.erase(pos);
114  } else {
115  return str.erase(pos, index(str,start+len) - pos);
116  }
117 }
118 
120 {
121  return erase(str, size);
122 }
123 
124 void truncate_as_ucs4(utf8::string &str, const size_t size)
125 {
126  ucs4::string u4_str = unicode_cast<ucs4::string>(str);
127  if(u4_str.size() > size) {
128  u4_str.resize(size);
129  str = unicode_cast<utf8::string>(u4_str);
130  }
131 }
132 
133 } // end namespace utf8
std::vector< char_t > string
size_t index(const utf8::string &str, const size_t index)
Byte index (code-unit offset) corresponding to the nth character in a UTF-8 string.
Definition: unicode.cpp:73
unsigned int count_leading_ones(N n)
Returns the quantity of leading 1 bits in n — i.e., the quantity of bits in n, minus the 1-based bit...
Definition: util.hpp:415
int pos
Definition: formula.cpp:800
#define uchar(c)
Definition: lstrlib.cpp:32
void truncate_as_ucs4(utf8::string &str, const size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:124
ucs4_convert_impl::enableif< TD, typename TS::value_type >::type unicode_cast(const TS &source)
utf8::string lowercase(const utf8::string &s)
Returns a lowercased version of the string.
Definition: unicode.cpp:53
#define ERR_GENERAL
Definition: unicode.cpp:34
GLenum GLsizei len
Definition: glew.h:5662
GLuint start
Definition: glew.h:1221
static int byte_size_from_utf8_first(const unsigned char ch)
Definition: unicode.cpp:38
utf8::string & truncate(utf8::string &str, const size_t size)
Truncates a UTF-8 string to the specified number of characters.
Definition: unicode.cpp:119
Templates and utility-routines for strings and numbers.
GLuint GLuint GLsizei count
Definition: glew.h:1221
size_t size(const utf8::string &str)
Length in characters of a UTF-8 string.
Definition: unicode.cpp:88
GLuint res
Definition: glew.h:9258
Functions for converting Unicode wide-char strings to UTF-8 encoded strings, back and forth...
Definition: unicode.cpp:36
std::map< std::string, tfilter >::iterator itor
Definition: filter.cpp:199
Thrown by operations encountering invalid UTF-8 data.
GLuint index
Definition: glew.h:1782
boost::uint32_t char_t
size_t i
Definition: function.cpp:1057
static iterator_base end(const string_type &str)
GLsizeiptr size
Definition: glew.h:1649
static lg::log_domain log_engine("engine")
const std::pair< typename string_type::const_iterator, typename string_type::const_iterator > & substr() const
Standard logging facilities (interface).
utf8::string & insert(utf8::string &str, const size_t pos, const utf8::string &insert)
Insert a UTF-8 string at the specified position.
Definition: unicode.cpp:101
GLdouble s
Definition: glew.h:1358
std::string string
utf8::string & erase(utf8::string &str, const size_t start, const size_t len)
Erases a portion of a UTF-8 string.
Definition: unicode.cpp:106