UCommon
unicode.h
Go to the documentation of this file.
1 // Copyright (C) 2009-2014 David Sugar, Tycho Softworks.
2 // Copyright (C) 2015 Cherokees of Idaho.
3 //
4 // This file is part of GNU uCommon C++.
5 //
6 // GNU uCommon C++ is free software: you can redistribute it and/or modify
7 // it under the terms of the GNU Lesser General Public License as published
8 // by the Free Software Foundation, either version 3 of the License, or
9 // (at your option) any later version.
10 //
11 // GNU uCommon C++ is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU Lesser General Public License for more details.
15 //
16 // You should have received a copy of the GNU Lesser General Public License
17 // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
18 
33 #ifndef _UCOMMON_UNICODE_H_
34 #define _UCOMMON_UNICODE_H_
35 
36 #ifndef _UCOMMON_STRING_H_
37 #include <ucommon/string.h>
38 #endif
39 
40 #ifdef nil
41 #undef nil
42 #endif
43 
44 namespace ucommon {
45 
/**
 * 32 bit unicode character code.
 */
50 typedef int32_t ucs4_t;
51 
/**
 * 16 bit unicode character code.
 * NOTE(review): the underlying type is signed, so 16 bit values of 0x8000
 * and above are stored as negative numbers.
 */
55 typedef int16_t ucs2_t;
56 
/**
 * Resolves issues where wchar_t is not defined.
 * Because this is a plain void pointer type, writing "const unicode_t"
 * makes the pointer itself const, not the data it addresses.
 */
60 typedef void *unicode_t;
61 
/**
 * A core class of utf8 encoded string functions.  Every member declared
 * here is static; the definitions are not part of this header.
 */
67 class __EXPORT utf8
68 {
69 public:
    /**
     * Size of "unicode_t" character codes, may not be ucs4_t size.
     */
73  static const unsigned ucsize;
74 
    /**
     * A convenient NULL pointer value.
     */
78  static const char *nil;
79 
    /**
     * NOTE(review): presumably the number of bytes used by the utf8
     * sequence starting at codepoint -- confirm in the implementation.
     */
85  static unsigned size(const char *codepoint);
86 
    /**
     * Count utf8 encoded ucs4 codepoints in string.
     */
92  static size_t count(const char *string);
93 
    /**
     * NOTE(review): position is signed (ssize_t); presumably a codepoint
     * offset into string, with negative values counted from the end --
     * confirm in the implementation.
     */
100  static char *offset(char *string, ssize_t position);
101 
    /**
     * Convert a utf8 encoded codepoint to a ucs4 character value.
     */
107  static ucs4_t codepoint(const char *encoded);
108 
    /**
     * NOTE(review): presumably the utf8 size needed for a unicode_t
     * string -- confirm.  The const here applies to the pointer only.
     */
114  static size_t chars(const unicode_t string);
115 
    /**
     * NOTE(review): presumably the utf8 size needed for one ucs4
     * character -- confirm.
     */
121  static size_t chars(ucs4_t character);
122 
    /**
     * NOTE(review): takes a unicode_t string and a CharacterProtocol;
     * direction of transfer and meaning of the result are not visible
     * in this header -- confirm.
     */
129  static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
130 
    /**
     * NOTE(review): counterpart of unpack() with an explicit size limit;
     * units of size are not visible in this header -- confirm.
     */
138  static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
139 
    /**
     * Dup a utf8 string into a ucs4_t string.
     */
143  static ucs4_t *udup(const char *string);
144 
    /**
     * Dup a utf8 string into a ucs2_t representation.
     */
148  static ucs2_t *wdup(const char *string);
149 
    /**
     * Search string for character; start defaults to 0.
     * NOTE(review): units of start and the not-found result are not
     * visible here -- confirm.
     */
157  static const char *find(const char *string, ucs4_t character, size_t start = 0);
158 
    /**
     * Reverse search of string for character; end defaults to the
     * largest size_t value.
     */
166  static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
167 
    /**
     * NOTE(review): presumably the number of times character occurs in
     * string -- confirm.
     */
174  static unsigned ccount(const char *string, ucs4_t character);
175 
    /**
     * NOTE(review): presumably reads one character from buffer -- confirm.
     */
181  static ucs4_t get(CharacterProtocol& buffer);
182 
    /**
     * NOTE(review): presumably writes character to buffer; meaning of the
     * returned value is not visible here -- confirm.
     */
189  static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
190 };
191 
/**
 * A copy-on-write utf8 string class that operates by reference count.
 * Derives from both String and utf8.
 *
 * NOTE(review): every member below follows the "protected:" label and no
 * "public:" label is visible in this listing, so as shown none of these
 * members can be used from outside the class hierarchy -- confirm against
 * the upstream header.
 */
198 class __EXPORT UString : public String, public utf8
199 {
200 protected:
    /** Default constructor. */
204  UString();
205 
    /** Construct from a strsize_t size value. */
210  UString(strsize_t size);
211 
    /** Construct from a unicode_t string. */
216  UString(const unicode_t text);
217 
    /** Construct from character text and a strsize_t size value. */
224  UString(const char *text, strsize_t size);
225 
    /** Construct from a pair of unicode_t pointers, text and end. */
232  UString(const unicode_t *text, const unicode_t *end);
233 
    /** Copy constructor. */
239  UString(const UString& existing);
240 
    /** Virtual destructor. */
245  virtual ~UString();
246 
    /**
     * NOTE(review): presumably extracts a substring starting at a
     * codepoint position; size defaults to 0 -- confirm semantics.
     */
253  UString get(strsize_t codepoint, strsize_t size = 0) const;
254 
    /**
     * Extract into a unicode_t buffer.  This is the function invoked by
     * operator()(unicode_t, size_t).
     */
261  size_t get(unicode_t unicode, size_t size) const;
262 
    /** NOTE(review): presumably assigns from a unicode_t string -- confirm. */
267  void set(const unicode_t unicode);
268 
    /** NOTE(review): presumably appends a unicode_t string -- confirm. */
273  void add(const unicode_t unicode);
274 
    /**
     * Return unicode character found at a specific codepoint in the string.
     */
280  ucs4_t at(int position) const;
281 
    /**
     * Extract a unicode byte sequence from utf8 object.
     * Forwards to get(unicode, size).
     */
288  inline size_t operator()(unicode_t unicode, size_t size) const
289  {return get(unicode, size);}
290 
    /**
     * Substring operator taking a signed codepoint position and a size.
     * left(), right() and copy() are all expressed through this operator.
     */
297  UString operator()(int codepoint, strsize_t size) const;
298 
    /**
     * Convenience method for left of string.
     * Calls operator()(0, size).
     */
304  inline UString left(strsize_t size) const
305  {return operator()(0, size);}
306 
    /**
     * Convenience method for right of string.
     * Calls operator() with the negated offset and a size of 0.
     */
312  inline UString right(strsize_t offset) const
313  {return operator()(-((int)offset), 0);}
314 
    /**
     * Convenience method for substring extraction.
     * Calls operator()(offset, size).
     */
321  inline UString copy(strsize_t offset, strsize_t size) const
322  {return operator()((int)offset, size);}
323 
    /** NOTE(review): presumably removes text at offset; size defaults to 0 -- confirm. */
329  void cut(strsize_t offset, strsize_t size = 0);
330 
    /** NOTE(review): presumably inserts text at offset; size defaults to 0 -- confirm. */
337  void paste(strsize_t offset, const char *text, strsize_t size = 0);
338 
    /** NOTE(review): presumably returns the utf8 text at a signed offset -- confirm. */
346  const char *operator()(int offset) const;
347 
    /**
     * Reference a unicode character in string object by array offset.
     * Forwards to UString::at(position).
     */
353  inline ucs4_t operator[](int position) const
354  {return UString::at(position);}
355 
    /**
     * Count codepoints in current string.
     * The result of utf8::count() on str->text is cast to strsize_t, so
     * it is narrowed to the range of that type.
     */
360  inline strsize_t count(void) const
361  {return (strsize_t)utf8::count(str->text);}
362 
    /** Member counterpart of utf8::ccount() taking only the character. */
368  unsigned ccount(ucs4_t character) const;
369 
    /** Member counterpart of utf8::find(); start defaults to 0. */
376  const char *find(ucs4_t character, strsize_t start = 0) const;
377 
    /** Member counterpart of utf8::rfind(); end defaults to npos. */
384  const char *rfind(ucs4_t character, strsize_t end = npos) const;
385 };
386 
/**
 * Pointer to utf8 encoded character data.
 * Holds a single uint8_t pointer named text.
 */
392 class __EXPORT utf8_pointer
393 {
394 protected:
395  uint8_t *text;
396 
397 public:
    /** Default constructor. */
401  utf8_pointer();
402 
    /** Construct from a character string. */
407  utf8_pointer(const char *string);
408 
    // NOTE(review): the listing jumps from 408 to a blank 414; a documented
    // declaration may have been lost here -- verify against upstream.
414 
    /** Prefix increment; returns this object by reference. */
419  utf8_pointer& operator ++();
420 
    /** Prefix decrement; returns this object by reference. */
425  utf8_pointer& operator --();
426 
    /** Advance in place by a signed offset. */
432  utf8_pointer& operator +=(long offset);
433 
    /** Move back in place by a signed offset. */
439  utf8_pointer& operator -=(long offset);
440 
    /** Return a new utf8_pointer for this position plus offset. */
446  utf8_pointer operator+(long offset) const;
447 
    /** Return a new utf8_pointer for this position minus offset. */
453  utf8_pointer operator-(long offset) const;
454 
    /** True when text is not NULL. */
459  inline operator bool() const
460  {return text != NULL;}
461 
    /**
     * Check if text is an invalid pointer.
     * True when text is NULL.
     */
466  inline bool operator!() const
467  {return text == NULL;}
468 
    /** NOTE(review): presumably the character at a codepoint offset -- confirm. */
474  ucs4_t operator[](long codepoint) const;
475 
    /** Assign from a character string. */
481  utf8_pointer& operator=(const char *string);
482 
    /** NOTE(review): presumably the named form of operator++ -- confirm. */
486  void inc(void);
487 
    /** NOTE(review): presumably the named form of operator-- -- confirm. */
491  void dec(void);
492 
    /**
     * Check if pointer equals another string.
     * Compares the two addresses; the text itself is not compared.
     */
498  inline bool operator==(const char *string) const
499  {return (const char *)text == string;}
500 
    /**
     * Check if pointer does not equal another string.
     * Compares the two addresses; the text itself is not compared.
     */
506  inline bool operator!=(const char *string) const
507  {return (const char *)text != string;}
508 
    /**
     * Get unicode character pointed to by pointer.
     * Passes text to utf8::codepoint().
     */
513  inline ucs4_t operator*() const
514  {return utf8::codepoint((const char *)text);}
515 
    /**
     * Get c string we point to.
     * Returns text as a non-const char pointer even though this is a
     * const member.
     */
520  inline char *c_str(void) const
521  {return (char *)text;}
522 
    /** Implicit conversion returning the same pointer as c_str(). */
527  inline operator char*() const
528  {return (char *)text;}
529 
    /**
     * Get length of null terminated utf8 string in codepoints.
     * Passes text to utf8::count().
     */
534  inline size_t len(void) const
535  {return utf8::count((const char *)text);}
536 };
537 
/**
 * Free-function shorthand for utf8::udup(): dup a utf8 string into a
 * ucs4_t string.
 * NOTE(review): presumably the result is released with dupfree<ucs4_t*>() --
 * confirm the allocator used by udup().
 */
538 inline ucs4_t *strudup(const char *string)
539  {return utf8::udup(string);}
540 
/**
 * Free-function shorthand for utf8::wdup(): dup a utf8 string into a
 * ucs2_t representation.
 * NOTE(review): presumably the result is released with dupfree<ucs2_t*>() --
 * confirm the allocator used by wdup().
 */
541 inline ucs2_t *strwdup(const char *string)
542  {return utf8::wdup(string);}
543 
/**
 * Exported function returning a unicode_t for a character string; only the
 * declaration is present in this header.
 * NOTE(review): presumably duplicates string into the unicode_t
 * representation -- confirm in the implementation.
 */
544 __EXPORT unicode_t unidup(const char *string);
545 
/**
 * dupfree specialization for ucs2_t pointers: hands the pointer to ::free().
 */
546 template<>
547 inline void dupfree<ucs2_t*>(ucs2_t *string)
548  {::free(string);}
549 
/**
 * dupfree specialization for ucs4_t pointers: hands the pointer to ::free().
 */
550 template<>
551 inline void dupfree<ucs4_t*>(ucs4_t *string)
552  {::free(string);}
553 
/**
 * dupfree specialization for unicode_t: hands the pointer to ::free().
 */
554 template<>
555 inline void dupfree<unicode_t>(unicode_t string)
556  {::free(string);}
557 
/**
 * Convenience type for utf8 encoded strings.
 */
561 typedef UString ustring_t;
562 
/**
 * Convenience type for utf8_pointer strings.
 */
566 typedef utf8_pointer utf8_t;
567 
568 } // namespace ucommon
569 
570 #endif
static const char * nil
A convenient NULL pointer value.
Definition: unicode.h:78
A common string class and character string support functions.
A core class of utf8 encoded string functions.
Definition: unicode.h:67
ucs4_t operator*() const
Get unicode character pointed to by pointer.
Definition: unicode.h:513
UString right(strsize_t offset) const
Convenience method for right of string.
Definition: unicode.h:312
static size_t count(const char *string)
Count utf8 encoded ucs4 codepoints in string.
bool operator!=(const char *string) const
Check if pointer does not equal another string (the addresses are compared, not the text).
Definition: unicode.h:506
bool operator==(const char *string) const
Check if pointer equals another string (the addresses are compared, not the text).
Definition: unicode.h:498
int16_t ucs2_t
16 bit unicode character code.
Definition: unicode.h:55
UString copy(strsize_t offset, strsize_t size) const
Convenience method for substring extraction.
Definition: unicode.h:321
A copy-on-write utf8 string class that operates by reference count.
Definition: unicode.h:198
unsigned short strsize_t
A convenience type for size of strings.
Definition: string.h:71
ObjectProtocol * copy(ObjectProtocol *object)
Convenience function to access object copy.
Definition: object.h:510
A copy-on-write string class that operates by reference count.
Definition: string.h:83
ucs4_t at(int position) const
Return unicode character found at a specific codepoint in the string.
int32_t ucs4_t
32 bit unicode character code.
Definition: unicode.h:50
static ucs4_t * udup(const char *string)
Dup a utf8 string into a ucs4_t string.
UString ustring_t
Convenience type for utf8 encoded strings.
Definition: unicode.h:561
static const unsigned ucsize
Size of "unicode_t" character codes, may not be ucs4_t size.
Definition: unicode.h:73
void * unicode_t
Resolves issues where wchar_t is not defined.
Definition: unicode.h:60
ucs4_t operator[](int position) const
Reference a unicode character in string object by array offset.
Definition: unicode.h:353
Common namespace for all ucommon objects.
Definition: access.h:47
static ucs4_t codepoint(const char *encoded)
Convert a utf8 encoded codepoint to a ucs4 character value.
size_t operator()(unicode_t unicode, size_t size) const
Extract a unicode byte sequence from utf8 object.
Definition: unicode.h:288
Pointer to utf8 encoded character data.
Definition: unicode.h:392
bool operator!() const
Check if text is an invalid pointer.
Definition: unicode.h:466
static ucs2_t * wdup(const char *string)
Dup a utf8 string into a ucs2_t representation.
utf8_pointer utf8_t
Convenience type for utf8_pointer strings.
Definition: unicode.h:566
size_t len(void) const
Get length of null terminated utf8 string in codepoints.
Definition: unicode.h:534
void start(JoinableThread *thread, int priority=0)
Convenience function to start a joinable thread.
Definition: thread.h:1866
char * c_str(void) const
Get c string we point to.
Definition: unicode.h:520
UString left(strsize_t size) const
Convenience method for left of string.
Definition: unicode.h:304
strsize_t count(void) const
Count codepoints in current string.
Definition: unicode.h:360
Common character processing protocol.
Definition: protocols.h:175