Dillo v3.1.1-120-g540bad94
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
unicode.cc
Go to the documentation of this file.
1/*
2 * Dillo Widget
3 *
4 * Copyright 2012, 2013 Sebastian Geerken <sgeerken@dillo.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20
21#include "unicode.hh"
22#include "misc.hh"
23
24using namespace lout::misc;
25
26namespace lout {
27
28namespace unicode {
29
/*
 * Bitmap of the code points U+0000..U+04FF that are regarded as letters.
 *
 * Code point ch is represented by bit (ch & 7) of byte (ch / 8), which is
 * how isAlpha() looks it up. The table therefore needs exactly 0x500 / 8
 * bytes, and every block below must contribute (block size / 8) bytes;
 * otherwise all following blocks are shifted to wrong code points.
 */
static const unsigned char alpha[0x500 / 8] = {
   // 0000-007F: C0 Controls and Basic Latin (16 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07,
   // 0080-00FF: C1 Controls and Latin-1 Supplement (16 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0xff, 0xff, 0x7f, 0xff, 0xff, 0xff, 0x7f, 0xff,
   // 0100-017F: Latin Extended-A (16 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   // 0180-024F: Latin Extended-B (26 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff,
   // 0250-02AF: IPA Extensions (12 bytes)
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff,
   // 02B0-02FF: Spacing Modifier Letters (10 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00,
   // 0300-036F: Combining Diacritical Marks (14 bytes)
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   // 0370-03FF: Greek and Coptic (18 bytes; two surplus 0xff bytes that
   // used to follow here pushed the Cyrillic block 16 code points too far)
   0xcf, 0x00, 0x40, 0x7d, 0xff, 0xff, 0xfb, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff,
   // 0400-04FF: Cyrillic (32 bytes); 0x03, 0xfc excludes U+0482..U+0489
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0x03, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
64
68bool isAlpha (int ch)
69{
70 return ch < 0x500 && (alpha[ch / 8] & (1 << (ch & 7)));
71}
72
/**
 * \brief Decodes the first character of a NUL-terminated UTF-8 string.
 *
 * Sequences of one to four bytes are recognized by their lead byte and
 * continuation bytes. No further validation (such as rejecting overlong
 * encodings) is done. If the bytes at s do not form such a sequence, the
 * first byte is returned as is, i.e. it is treated as ISO-8859-1 /
 * ISO-8859-15 / Windows-1252.
 *
 * \param s Pointer to the first byte of the character; must be readable up
 *    to the terminating NUL.
 * \return The decoded code point; never negative.
 */
int decodeUtf8 (const char *s)
{
   // Work on unsigned bytes: with a signed "char", returning s[0] directly
   // would yield a negative value for every byte >= 0x80.
   const unsigned char *u = (const unsigned char *) s;

   if ((u[0] & 0x80) == 0)
      return u[0];
   else if ((u[0] & 0xe0) == 0xc0 && (u[1] & 0xc0) == 0x80)
      return ((u[0] & 0x1f) << 6) | (u[1] & 0x3f);
   else if ((u[0] & 0xf0) == 0xe0 && (u[1] & 0xc0) == 0x80
            && (u[2] & 0xc0) == 0x80)
      return ((u[0] & 0x0f) << 12) | ((u[1] & 0x3f) << 6) | (u[2] & 0x3f);
   else if ((u[0] & 0xf8) == 0xf0 && (u[1] & 0xc0) == 0x80
            && (u[2] & 0xc0) == 0x80 && (u[3] & 0xc0) == 0x80)
      return ((u[0] & 0x07) << 18) | ((u[1] & 0x3f) << 12)
         | ((u[2] & 0x3f) << 6) | (u[3] & 0x3f);
   else
      // Treat as ISO-8859-1 / ISO-8859-15 / Windows-1252
      return u[0];
}
90
91
/**
 * \brief Decodes the first character of a UTF-8 byte range.
 *
 * Works like the NUL-terminated variant, but never reads more than len
 * bytes. A multi-byte sequence that is cut off by len, or that is otherwise
 * not well-formed, is not decoded; instead the first byte is returned as
 * is, i.e. it is treated as ISO-8859-1 / ISO-8859-15 / Windows-1252.
 *
 * \param s Pointer to the first byte of the character.
 * \param len Number of readable bytes at s.
 * \return The decoded code point (never negative), or 0 if len is not
 *    positive.
 */
int decodeUtf8 (const char *s, int len)
{
   // Nothing may be read from an empty range, not even s[0].
   if (len <= 0)
      return 0;

   // Work on unsigned bytes: with a signed "char", returning s[0] directly
   // would yield a negative value for every byte >= 0x80.
   const unsigned char *u = (const unsigned char *) s;

   if ((u[0] & 0x80) == 0)
      return u[0];
   else if (len >= 2 && (u[0] & 0xe0) == 0xc0 && (u[1] & 0xc0) == 0x80)
      return ((u[0] & 0x1f) << 6) | (u[1] & 0x3f);
   else if (len >= 3 && (u[0] & 0xf0) == 0xe0 && (u[1] & 0xc0) == 0x80
            && (u[2] & 0xc0) == 0x80)
      return ((u[0] & 0x0f) << 12) | ((u[1] & 0x3f) << 6) | (u[2] & 0x3f);
   else if (len >= 4 && (u[0] & 0xf8) == 0xf0 && (u[1] & 0xc0) == 0x80
            && (u[2] & 0xc0) == 0x80 && (u[3] & 0xc0) == 0x80)
      return ((u[0] & 0x07) << 18) | ((u[1] & 0x3f) << 12)
         | ((u[2] & 0x3f) << 6) | (u[3] & 0x3f);
   else
      // Treat as ISO-8859-1 / ISO-8859-15 / Windows-1252
      return u[0];
}
109
/**
 * \brief Returns a pointer to the character following the one at s in a
 *    NUL-terminated UTF-8 string.
 *
 * A byte that does not start a well-formed sequence of two to four bytes is
 * skipped as a single byte.
 *
 * \param s Pointer to the current character; may be NULL.
 * \return Pointer to the next character, or NULL if s is NULL, s points to
 *    the terminating NUL, or the character at s is the last one.
 */
const char *nextUtf8Char (const char *s)
{
   if (s == NULL || *s == 0)
      return NULL;

   // Number of bytes to skip. Stays 1 for ASCII and for anything that is
   // not a complete multi-byte sequence.
   int seqLen = 1;

   if ((s[0] & 0xe0) == 0xc0) {
      if ((s[1] & 0xc0) == 0x80)
         seqLen = 2;
   } else if ((s[0] & 0xf0) == 0xe0) {
      if ((s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80)
         seqLen = 3;
   } else if ((s[0] & 0xf8) == 0xf0) {
      if ((s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80
          && (s[3] & 0xc0) == 0x80)
         seqLen = 4;
   }

   const char *next = s + seqLen;

   // Reaching the terminator means there is no further character.
   return *next == 0 ? NULL : next;
}
135
/**
 * \brief Returns a pointer to the character following the one at s in a
 *    UTF-8 byte range of len bytes.
 *
 * A byte that does not start a well-formed sequence of two to four bytes
 * lying completely inside the range is skipped as a single byte. A NUL byte
 * is not treated specially.
 *
 * \param s Pointer to the current character; may be NULL.
 * \param len Number of bytes available at s.
 * \return Pointer to the next character, or NULL if s is NULL, len is not
 *    positive, or the character at s reaches the end of the range.
 */
const char *nextUtf8Char (const char *s, int len)
{
   if (s == NULL || len <= 0)
      return NULL;

   // Number of bytes to skip. Stays 1 for ASCII and for anything that is
   // not a complete multi-byte sequence within the range.
   int seqLen = 1;

   if ((s[0] & 0xe0) == 0xc0) {
      if (len >= 2 && (s[1] & 0xc0) == 0x80)
         seqLen = 2;
   } else if ((s[0] & 0xf0) == 0xe0) {
      if (len >= 3 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80)
         seqLen = 3;
   } else if ((s[0] & 0xf8) == 0xf0) {
      if (len >= 4 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80
          && (s[3] & 0xc0) == 0x80)
         seqLen = 4;
   }

   // Nothing is left once the current character fills the rest of the range.
   if (seqLen >= len)
      return NULL;

   return s + seqLen;
}
161
162int numUtf8Chars (const char *s)
163{
164 int numUtf8 = 0;
165 for (const char *r = s; r; r = nextUtf8Char (r))
166 numUtf8++;
167 return numUtf8;
168}
169
170int numUtf8Chars (const char *s, int len)
171{
172 int numUtf8 = 0;
173 for (const char *r = s; len > 0 && r; r = nextUtf8Char (r, len))
174 numUtf8++;
175 return numUtf8;
176}
177
178} // namespace unicode
179
180} // namespace lout
Miscellaneous stuff, which does not fit anywhere else.
Definition misc.cc:31
static unsigned char alpha[0x500]
Definition unicode.cc:30
int numUtf8Chars(const char *s)
Definition unicode.cc:162
int decodeUtf8(const char *s)
Definition unicode.cc:73
bool isAlpha(int ch)
Returns whether a given unicode character is an alphabetic character.
Definition unicode.cc:68
const char * nextUtf8Char(const char *s)
Definition unicode.cc:110