Boost.Locale
index.hpp
1 //
2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3 //
4 // Distributed under the Boost Software License, Version 1.0.
5 // https://www.boost.org/LICENSE_1_0.txt
6 
7 #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
8 #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
9 
10 #include <boost/locale/boundary/boundary_point.hpp>
11 #include <boost/locale/boundary/facets.hpp>
12 #include <boost/locale/boundary/segment.hpp>
13 #include <boost/locale/boundary/types.hpp>
14 #include <boost/cstdint.hpp>
15 #include <boost/iterator/iterator_facade.hpp>
16 #include <algorithm>
17 #include <iterator>
18 #include <locale>
19 #include <memory>
20 #include <stdexcept>
21 #include <string>
22 #include <type_traits>
23 #include <vector>
24 
25 #ifdef BOOST_MSVC
26 # pragma warning(push)
27 # pragma warning(disable : 4275 4251 4231 4660)
28 #endif
29 
30 namespace boost { namespace locale { namespace boundary {
39 
41 
42  namespace detail {
43  template<typename Char>
44  const boundary_indexing<Char>& get_boundary_indexing(const std::locale& l)
45  {
46  using facet_type = boundary_indexing<Char>;
47  if(!std::has_facet<facet_type>(l))
48  throw std::runtime_error("Locale was generated without segmentation support!");
49  return std::use_facet<facet_type>(l);
50  }
51 
52  template<typename IteratorType,
53  typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category>
54  struct mapping_traits {
55  typedef typename std::iterator_traits<IteratorType>::value_type char_type;
56  static index_type map(boundary_type t, IteratorType b, IteratorType e, const std::locale& l)
57  {
58  std::basic_string<char_type> str(b, e);
59  return get_boundary_indexing<char_type>(l).map(t, str.c_str(), str.c_str() + str.size());
60  }
61  };
62 
// Compile-time check whether SomeIteratorType is one of the iterator types
// this header treats as "linear": a raw (const) pointer to CharType, or the
// (const_)iterator of std::basic_string<CharType> or std::vector<CharType>.
template<typename CharType, typename SomeIteratorType>
struct linear_iterator_traits {
private:
    using string_type = std::basic_string<CharType>;
    using vector_type = std::vector<CharType>;

    static constexpr bool is_raw_pointer =
      std::is_same<SomeIteratorType, CharType*>::value || std::is_same<SomeIteratorType, const CharType*>::value;

    static constexpr bool is_string_iterator =
      std::is_same<SomeIteratorType, typename string_type::iterator>::value
      || std::is_same<SomeIteratorType, typename string_type::const_iterator>::value;

    static constexpr bool is_vector_iterator =
      std::is_same<SomeIteratorType, typename vector_type::iterator>::value
      || std::is_same<SomeIteratorType, typename vector_type::const_iterator>::value;

public:
    // True when SomeIteratorType matches any of the recognised types above.
    static constexpr bool is_linear = is_raw_pointer || is_string_iterator || is_vector_iterator;
};
72 
73  template<typename IteratorType>
74  struct mapping_traits<IteratorType, std::random_access_iterator_tag> {
75  typedef typename std::iterator_traits<IteratorType>::value_type char_type;
76 
77  static index_type map(boundary_type t, IteratorType b, IteratorType e, const std::locale& l)
78  {
79  index_type result;
80 
81  // Optimize for most common cases
82  //
83  // C++11 requires that string is continuous in memory and all known
84  // string implementations do this because of c_str() support.
85 
86  if(linear_iterator_traits<char_type, IteratorType>::is_linear && b != e) {
87  const char_type* begin = &*b;
88  const char_type* end = begin + (e - b);
89  index_type tmp = get_boundary_indexing<char_type>(l).map(t, begin, end);
90  result.swap(tmp);
91  } else {
92  std::basic_string<char_type> str(b, e);
93  index_type tmp = get_boundary_indexing<char_type>(l).map(t, str.c_str(), str.c_str() + str.size());
94  result.swap(tmp);
95  }
96  return result;
97  }
98  };
99 
100  template<typename BaseIterator>
101  class mapping {
102  public:
103  typedef BaseIterator base_iterator;
104  typedef typename std::iterator_traits<base_iterator>::value_type char_type;
105 
106  mapping(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc) :
107  index_(new index_type()), begin_(begin), end_(end)
108  {
109  index_type idx = detail::mapping_traits<base_iterator>::map(type, begin, end, loc);
110  index_->swap(idx);
111  }
112 
113  mapping() {}
114 
115  const index_type& index() const { return *index_; }
116 
117  base_iterator begin() const { return begin_; }
118 
119  base_iterator end() const { return end_; }
120 
121  private:
122  std::shared_ptr<index_type> index_;
123  base_iterator begin_, end_;
124  };
125 
// Bidirectional iterator that walks the segments described by a mapping.
//
// The iterator keeps a cached segment (value_) together with a pair of
// positions into the boundary index (current_): current_.first is the index
// entry where the cached segment starts, current_.second the entry where it
// ends. Dereferencing returns the cached segment by const reference.
126  template<typename BaseIterator>
127  class segment_index_iterator : public boost::iterator_facade<segment_index_iterator<BaseIterator>,
128  segment<BaseIterator>,
129  boost::bidirectional_traversal_tag,
130  const segment<BaseIterator>&> {
131  public:
132  typedef BaseIterator base_iterator;
133  typedef mapping<base_iterator> mapping_type;
134  typedef segment<base_iterator> segment_type;
135 
     // Detached iterator: no mapping, empty mask, positions at zero.
136  segment_index_iterator() : current_(0, 0), map_(0), mask_(0), full_select_(false) {}
137 
     // Iterator positioned relative to the text position p (see set()).
     // `map` is stored as a plain pointer, so it has to outlive the iterator.
138  segment_index_iterator(base_iterator p, const mapping_type* map, rule_type mask, bool full_select) :
139  map_(map), mask_(mask), full_select_(full_select)
140  {
141  set(p);
142  }
     // Begin (is_begin == true) or end (is_begin == false) iterator for `map`.
143  segment_index_iterator(bool is_begin, const mapping_type* map, rule_type mask, bool full_select) :
144  map_(map), mask_(mask), full_select_(full_select)
145  {
146  if(is_begin)
147  set_begin();
148  else
149  set_end();
150  }
151 
     // iterator_facade hook: the cached segment.
152  const segment_type& dereference() const { return value_; }
153 
     // iterator_facade hook: two iterators are equal when they refer to the
     // same mapping object and their segments end at the same index entry.
     // The start position, mask and full_select flag do not take part.
154  bool equal(const segment_index_iterator& other) const
155  {
156  return map_ == other.map_ && current_.second == other.current_.second;
157  }
158 
     // iterator_facade hook: move to the next segment.
159  void increment()
160  {
161  std::pair<size_t, size_t> next = current_;
162  if(full_select_) {
     // The new segment starts exactly where the previous one ended ...
163  next.first = next.second;
     // ... and ends at the next entry accepted by valid_offset(), or at size().
164  while(next.second < size()) {
165  next.second++;
166  if(valid_offset(next.second))
167  break;
168  }
     // Ran off the index: use the same (size() - 1, size()) pair that set_end() stores.
169  if(next.second == size())
170  next.first = next.second - 1;
171  } else {
     // The start always trails the end by one index entry, so entries that
     // are skipped because of the mask are not part of the segment.
172  while(next.second < size()) {
173  next.first = next.second;
174  next.second++;
175  if(valid_offset(next.second))
176  break;
177  }
178  }
179  update_current(next);
180  }
181 
     // iterator_facade hook: move to the previous segment.
182  void decrement()
183  {
184  std::pair<size_t, size_t> next = current_;
185  if(full_select_) {
     // Step the end back to the previous accepted entry; it never goes below 1.
186  while(next.second > 1) {
187  next.second--;
188  if(valid_offset(next.second))
189  break;
190  }
     // Then search backwards from there for the accepted entry that starts the segment.
191  next.first = next.second;
192  while(next.first > 0) {
193  next.first--;
194  if(valid_offset(next.first))
195  break;
196  }
197  } else {
198  while(next.second > 1) {
199  next.second--;
200  if(valid_offset(next.second))
201  break;
202  }
     // Without full_select the segment starts one index entry before its end.
203  next.first = next.second - 1;
204  }
205  update_current(next);
206  }
207 
208  private:
     // Past-the-end state: positions (size() - 1, size()) and an empty
     // segment located at the end of the text with rule 0.
209  void set_end()
210  {
211  current_.first = size() - 1;
212  current_.second = size();
213  value_ = segment_type(map_->end(), map_->end(), 0);
214  }
     // Start from an empty segment at the beginning of the text and let
     // increment() find the first real segment.
215  void set_begin()
216  {
217  current_.first = current_.second = 0;
218  value_ = segment_type(map_->begin(), map_->begin(), 0);
219  increment();
220  }
221 
     // Position the iterator on a segment chosen relative to text position p.
222  void set(base_iterator p)
223  {
     // Distance of p from the start of the mapped text.
224  size_t dist = std::distance(map_->begin(), p);
225  index_type::const_iterator b = map_->index().begin(), e = map_->index().end();
     // First index entry that compares greater than break_info(dist)
     // (presumably ordered by offset; break_info is declared outside this file).
226  index_type::const_iterator boundary_point = std::upper_bound(b, e, break_info(dist));
     // Skip entries whose rule is filtered out by the mask.
227  while(boundary_point != e && (boundary_point->rule & mask_) == 0)
228  boundary_point++;
229 
     // That entry (or size() when none was found) becomes the end of the segment.
230  current_.first = current_.second = boundary_point - b;
231 
232  if(full_select_) {
     // Walk back to the previous accepted entry for the segment start.
233  while(current_.first > 0) {
234  current_.first--;
235  if(valid_offset(current_.first))
236  break;
237  }
238  } else {
     // Otherwise the segment starts one entry earlier, if there is one.
239  if(current_.first > 0)
240  current_.first--;
241  }
     // Translate both index positions into iterators of the underlying text.
242  value_.first = map_->begin();
243  std::advance(value_.first, get_offset(current_.first));
244  value_.second = value_.first;
245  std::advance(value_.second, get_offset(current_.second) - get_offset(current_.first));
246 
247  update_rule();
248  }
249 
     // Move the cached segment to the index positions `pos` by advancing both
     // text iterators by the difference between new and old offsets, which
     // may be negative.
250  void update_current(std::pair<size_t, size_t> pos)
251  {
252  std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first);
253  std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second);
254  std::advance(value_.first, first_diff);
255  std::advance(value_.second, second_diff);
256  current_ = pos;
257  update_rule();
258  }
259 
     // Copy the rule of the entry that ends the segment into the cached value.
     // At size() there is no such entry, so the previous rule is kept.
260  void update_rule()
261  {
262  if(current_.second != size()) {
263  value_.rule(index()[current_.second].rule);
264  }
265  }
     // Text offset stored for index entry `ind`; the position size() maps to
     // the offset of the last entry.
266  size_t get_offset(size_t ind) const
267  {
268  if(ind == size())
269  return index().back().offset;
270  return index()[ind].offset;
271  }
272 
     // Despite the parameter name, `offset` is a position in the index, not in
     // the text. Position 0 and size() are always accepted; any other position
     // is accepted when its rule intersects the mask.
273  bool valid_offset(size_t offset) const
274  {
275  return offset == 0 || offset == size() // make sure we do not access index[size]
276  || (index()[offset].rule & mask_) != 0;
277  }
278 
     // Number of entries in the boundary index.
279  size_t size() const { return index().size(); }
280 
     // The boundary index of the mapping; requires map_ to be non-null.
281  const index_type& index() const { return map_->index(); }
282 
     // Cached segment returned by dereference().
283  segment_type value_;
     // Index positions of the start (first) and end (second) of value_.
284  std::pair<size_t, size_t> current_;
     // Mapping being iterated; not owned.
285  const mapping_type* map_;
     // Rules that select segment ends.
286  rule_type mask_;
     // When true, segments start where the previous selected segment ended.
287  bool full_select_;
288  };
289 
// Bidirectional iterator that walks the boundary points described by a mapping.
//
// current_ is the position of the current entry in the boundary index and
// value_ is the cached boundary point (text iterator plus rule) returned by
// dereference().
290  template<typename BaseIterator>
291  class boundary_point_index_iterator : public boost::iterator_facade<boundary_point_index_iterator<BaseIterator>,
292  boundary_point<BaseIterator>,
293  boost::bidirectional_traversal_tag,
294  const boundary_point<BaseIterator>&> {
295  public:
296  typedef BaseIterator base_iterator;
297  typedef mapping<base_iterator> mapping_type;
298  typedef boundary_point<base_iterator> boundary_point_type;
299 
     // Detached iterator: no mapping, empty mask, position zero.
300  boundary_point_index_iterator() : current_(0), map_(0), mask_(0) {}
301 
     // Begin (is_begin == true) or end (is_begin == false) iterator for `map`.
     // `map` is stored as a plain pointer, so it has to outlive the iterator.
302  boundary_point_index_iterator(bool is_begin, const mapping_type* map, rule_type mask) :
303  map_(map), mask_(mask)
304  {
305  if(is_begin)
306  set_begin();
307  else
308  set_end();
309  }
     // Iterator positioned relative to the text position p (see set()).
310  boundary_point_index_iterator(base_iterator p, const mapping_type* map, rule_type mask) :
311  map_(map), mask_(mask)
312  {
313  set(p);
314  }
315 
     // iterator_facade hook: the cached boundary point.
316  const boundary_point_type& dereference() const { return value_; }
317 
     // iterator_facade hook: equal when both refer to the same mapping object
     // and the same index position; the mask does not take part.
318  bool equal(const boundary_point_index_iterator& other) const
319  {
320  return map_ == other.map_ && current_ == other.current_;
321  }
322 
     // iterator_facade hook: advance to the next accepted index position,
     // stopping at size() when there is none.
323  void increment()
324  {
325  size_t next = current_;
326  while(next < size()) {
327  next++;
328  if(valid_offset(next))
329  break;
330  }
331  update_current(next);
332  }
333 
     // iterator_facade hook: step back to the previous accepted index
     // position; position 0 is always accepted, so the search stops there.
334  void decrement()
335  {
336  size_t next = current_;
337  while(next > 0) {
338  next--;
339  if(valid_offset(next))
340  break;
341  }
342  update_current(next);
343  }
344 
345  private:
     // Past-the-end state: position size(), boundary point at the end of the text, rule 0.
346  void set_end()
347  {
348  current_ = size();
349  value_ = boundary_point_type(map_->end(), 0);
350  }
     // First boundary point: position 0 at the beginning of the text, rule 0.
351  void set_begin()
352  {
353  current_ = 0;
354  value_ = boundary_point_type(map_->begin(), 0);
355  }
356 
     // Position the iterator on a boundary point chosen relative to text position p.
357  void set(base_iterator p)
358  {
     // Distance of p from the start of the mapped text.
359  size_t dist = std::distance(map_->begin(), p);
360 
361  index_type::const_iterator b = index().begin();
362  index_type::const_iterator e = index().end();
     // First index entry that does not compare less than break_info(dist)
     // (presumably ordered by offset; break_info is declared outside this file).
363  index_type::const_iterator ptr = std::lower_bound(b, e, break_info(dist));
364 
     // Nothing at or after p: fall back to the last entry of the index.
365  if(ptr == index().end())
366  current_ = size() - 1;
367  else
368  current_ = ptr - index().begin();
369 
     // Move forward past entries rejected by the mask. valid_offset() accepts
     // the last entry unconditionally, which bounds this loop.
370  while(!valid_offset(current_))
371  current_++;
372 
     // Shift p from its original position to the offset of the chosen entry.
373  std::ptrdiff_t diff = get_offset(current_) - dist;
374  std::advance(p, diff);
375  value_.iterator(p);
376  update_rule();
377  }
378 
     // Move the cached boundary point to index position `pos` by advancing the
     // text iterator by the difference between new and old offsets, which may
     // be negative.
379  void update_current(size_t pos)
380  {
381  std::ptrdiff_t diff = get_offset(pos) - get_offset(current_);
382  base_iterator i = value_.iterator();
383  std::advance(i, diff);
384  current_ = pos;
385  value_.iterator(i);
386  update_rule();
387  }
388 
     // Copy the rule of the current entry into the cached value. At size()
     // there is no entry, so the previous rule is kept.
389  void update_rule()
390  {
391  if(current_ != size()) {
392  value_.rule(index()[current_].rule);
393  }
394  }
     // Text offset stored for index entry `ind`; the position size() maps to
     // the offset of the last entry.
395  size_t get_offset(size_t ind) const
396  {
397  if(ind == size())
398  return index().back().offset;
399  return index()[ind].offset;
400  }
401 
     // Despite the parameter name, `offset` is a position in the index, not in
     // the text. Accepted when it is the first position, the last entry or
     // beyond, or when the entry's rule intersects the mask.
402  bool valid_offset(size_t offset) const
403  {
404  return offset == 0 || offset + 1 >= size() // last and first are always valid regardless of mark
405  || (index()[offset].rule & mask_) != 0;
406  }
407 
     // Number of entries in the boundary index.
408  size_t size() const { return index().size(); }
409 
     // The boundary index of the mapping; requires map_ to be non-null.
410  const index_type& index() const { return map_->index(); }
411 
     // Cached boundary point returned by dereference().
412  boundary_point_type value_;
     // Position of the current entry in the boundary index.
413  size_t current_;
     // Mapping being iterated; not owned.
414  const mapping_type* map_;
     // Rules that select boundary points.
415  rule_type mask_;
416  };
417 
418  } // namespace detail
419 
421 
// Forward declarations: segment_index and boundary_point_index name each
// other as friends and convert from one another, so both names must be known
// before either class is defined.
template<typename BaseIterator>
class segment_index;

template<typename BaseIterator>
class boundary_point_index;
427 
477 
478  template<typename BaseIterator>
479  class segment_index {
480  public:
482  typedef BaseIterator base_iterator;
483 
484 #ifdef BOOST_LOCALE_DOXYGEN
485  typedef unspecified_iterator_type iterator;
499  typedef unspecified_iterator_type const_iterator;
500 #else
501  typedef detail::segment_index_iterator<base_iterator> iterator;
502  typedef detail::segment_index_iterator<base_iterator> const_iterator;
503 #endif
507 
515  segment_index() : mask_(0xFFFFFFFFu), full_select_(false) {}
521  rule_type mask,
522  const std::locale& loc = std::locale()) :
523  map_(type, begin, end, loc),
524  mask_(mask), full_select_(false)
525  {}
531  const std::locale& loc = std::locale()) :
532  map_(type, begin, end, loc),
533  mask_(0xFFFFFFFFu), full_select_(false)
534  {}
535 
545 
555 
560  void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc = std::locale())
561  {
562  map_ = mapping_type(type, begin, end, loc);
563  }
564 
572  iterator begin() const
573  {
574  return iterator(true, &map_, mask_, full_select_);
575  }
576 
582  iterator end() const
583  {
584  return iterator(false, &map_, mask_, full_select_);
585  }
586 
603  {
604  return iterator(p, &map_, mask_, full_select_);
605  }
606 
608  rule_type rule() const
609  {
610  return mask_;
611  }
613  void rule(rule_type v)
614  {
615  mask_ = v;
616  }
617 
628  bool full_select() const
629  {
630  return full_select_;
631  }
632 
643  void full_select(bool v)
644  {
645  full_select_ = v;
646  }
647 
648  private:
649  friend class boundary_point_index<base_iterator>;
650  typedef detail::mapping<base_iterator> mapping_type;
651  mapping_type map_;
652  rule_type mask_;
653  bool full_select_;
654  };
655 
700  template<typename BaseIterator>
701  class boundary_point_index {
702  public:
704  typedef BaseIterator base_iterator;
705 
706 #ifdef BOOST_LOCALE_DOXYGEN
707  typedef unspecified_iterator_type iterator;
722  typedef unspecified_iterator_type const_iterator;
723 #else
724  typedef detail::boundary_point_index_iterator<base_iterator> iterator;
725  typedef detail::boundary_point_index_iterator<base_iterator> const_iterator;
726 #endif
730 
738  boundary_point_index() : mask_(0xFFFFFFFFu) {}
739 
745  rule_type mask,
746  const std::locale& loc = std::locale()) :
747  map_(type, begin, end, loc),
748  mask_(mask)
749  {}
755  const std::locale& loc = std::locale()) :
756  map_(type, begin, end, loc),
757  mask_(0xFFFFFFFFu)
758  {}
759 
778 
783  void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc = std::locale())
784  {
785  map_ = mapping_type(type, begin, end, loc);
786  }
787 
795  iterator begin() const
796  {
797  return iterator(true, &map_, mask_);
798  }
799 
807  iterator end() const
808  {
809  return iterator(false, &map_, mask_);
810  }
811 
824  {
825  return iterator(p, &map_, mask_);
826  }
827 
829  rule_type rule() const
830  {
831  return mask_;
832  }
834  void rule(rule_type v)
835  {
836  mask_ = v;
837  }
838 
839  private:
840  friend class segment_index<base_iterator>;
841  typedef detail::mapping<base_iterator> mapping_type;
842  mapping_type map_;
843  rule_type mask_;
844  };
845 
847  template<typename BaseIterator>
848  segment_index<BaseIterator>::segment_index(const boundary_point_index<BaseIterator>& other) :
849  map_(other.map_), mask_(0xFFFFFFFFu), full_select_(false)
850  {}
851 
852  template<typename BaseIterator>
853  boundary_point_index<BaseIterator>::boundary_point_index(const segment_index<BaseIterator>& other) :
854  map_(other.map_), mask_(0xFFFFFFFFu)
855  {}
856 
857  template<typename BaseIterator>
858  segment_index<BaseIterator>& segment_index<BaseIterator>::operator=(const boundary_point_index<BaseIterator>& other)
859  {
860  map_ = other.map_;
861  return *this;
862  }
863 
864  template<typename BaseIterator>
865  boundary_point_index<BaseIterator>&
866  boundary_point_index<BaseIterator>::operator=(const segment_index<BaseIterator>& other)
867  {
868  map_ = other.map_;
869  return *this;
870  }
872 
875 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
877 #endif
878 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
880 #endif
881 
884 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
886 #endif
887 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
889 #endif
890 
893 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
895 #endif
896 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
898 #endif
899 
902 #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
904 #endif
905 #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
907 #endif
908 
909 }}} // namespace boost::locale::boundary
910 
917 
918 #ifdef BOOST_MSVC
919 # pragma warning(pop)
920 #endif
921 
922 #endif
void full_select(bool v)
Definition: index.hpp:643
iterator find(base_iterator p) const
Definition: index.hpp:823
boundary_point_index< const char32_t * > u32cboundary_point_index
convenience typedef
Definition: index.hpp:906
a segment object that represents a pair of two iterators that define the range where this segment exi...
Definition: segment.hpp:90
boundary_type
This type describes a possible boundary analysis alternatives.
Definition: types.hpp:30
bool full_select() const
Definition: index.hpp:628
rule_type rule() const
Get the mask of rules that are used.
Definition: index.hpp:608
This class holds an index of boundary points and allows iterating over them.
Definition: index.hpp:426
BaseIterator base_iterator
The type of the iterator used to iterate over the original text.
Definition: index.hpp:482
boundary_point_index(boundary_type type, base_iterator begin, base_iterator end, rule_type mask, const std::locale &loc=std::locale())
Definition: index.hpp:742
segment_index< std::u16string::const_iterator > u16ssegment_index
convenience typedef
Definition: index.hpp:876
iterator begin() const
Definition: index.hpp:572
segment_index(boundary_type type, base_iterator begin, base_iterator end, const std::locale &loc=std::locale())
Definition: index.hpp:528
boundary_point_index< std::wstring::const_iterator > wsboundary_point_index
convenience typedef
Definition: index.hpp:892
iterator end() const
Definition: index.hpp:807
segment_index & operator=(const boundary_point_index< base_iterator > &)
segment_index(boundary_type type, base_iterator begin, base_iterator end, rule_type mask, const std::locale &loc=std::locale())
Definition: index.hpp:518
boundary_point< base_iterator > value_type
Definition: index.hpp:729
segment< base_iterator > value_type
Definition: index.hpp:506
void rule(rule_type v)
Set the mask of rules that are used.
Definition: index.hpp:834
boundary_point_index< const wchar_t * > wcboundary_point_index
convenience typedef
Definition: index.hpp:901
boundary_point_index< const char16_t * > u16cboundary_point_index
convenience typedef
Definition: index.hpp:903
boundary_point_index & operator=(const segment_index< base_iterator > &other)
uint32_t rule_type
Flags used with word boundary analysis – the type of the word, line or sentence boundary found.
Definition: types.hpp:40
segment_index< const wchar_t * > wcsegment_index
convenience typedef
Definition: index.hpp:883
unspecified_iterator_type iterator
Definition: index.hpp:497
segment_index()
Definition: index.hpp:515
iterator end() const
Definition: index.hpp:582
iterator begin() const
Definition: index.hpp:795
boundary_point_index< std::string::const_iterator > sboundary_point_index
convenience typedef
Definition: index.hpp:891
segment_index< std::string::const_iterator > ssegment_index
convenience typedef
Definition: index.hpp:873
segment_index< std::wstring::const_iterator > wssegment_index
convenience typedef
Definition: index.hpp:874
unspecified_iterator_type const_iterator
Definition: index.hpp:499
void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale &loc=std::locale())
Definition: index.hpp:783
unspecified_iterator_type const_iterator
Definition: index.hpp:722
This class represents a boundary point in the text.
Definition: boundary_point.hpp:44
rule_type rule() const
Get the mask of rules that are used.
Definition: index.hpp:829
void rule(rule_type v)
Set the mask of rules that are used.
Definition: index.hpp:613
boundary_point_index(boundary_type type, base_iterator begin, base_iterator end, const std::locale &loc=std::locale())
Definition: index.hpp:752
boundary_point_index< std::u32string::const_iterator > u32sboundary_point_index
convenience typedef
Definition: index.hpp:897
iterator find(base_iterator p) const
Definition: index.hpp:602
unspecified_iterator_type iterator
Definition: index.hpp:720
boundary_point_index< std::u16string::const_iterator > u16sboundary_point_index
convenience typedef
Definition: index.hpp:894
Generate boundary analysis facet.
segment_index< const char16_t * > u16csegment_index
convenience typedef
Definition: index.hpp:885
BaseIterator base_iterator
The type of the iterator used to iterate over the original text.
Definition: index.hpp:704
segment_index< const char32_t * > u32csegment_index
convenience typedef
Definition: index.hpp:888
segment_index< const char * > csegment_index
convenience typedef
Definition: index.hpp:882
segment_index< std::u32string::const_iterator > u32ssegment_index
convenience typedef
Definition: index.hpp:879
std::vector< break_info > index_type
Definition: facets.hpp:50
boundary_point_index< const char * > cboundary_point_index
convenience typedef
Definition: index.hpp:900
boundary_point_index()
Definition: index.hpp:738
This class holds an index of segments in the text range and allows to iterate over them.
Definition: index.hpp:423
void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale &loc=std::locale())
Definition: index.hpp:560