Dillo v3.1.1-14-g8f67d6e0
Loading...
Searching...
No Matches
hyphenator.cc
Go to the documentation of this file.
1/*
2 * Dillo Widget
3 *
4 * Copyright 2012-2013 Sebastian Geerken <sgeerken@dillo.org>,
5 * Johannes Hofmann <Johannes.Hofmann@gmx.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21
22#include "hyphenator.hh"
23
24#include "../lout/misc.hh"
25#include "../lout/unicode.hh"
26#include <limits.h>
27#include <stdio.h>
28#include <string.h>
29
30#define LEN 1000
31
32/*
33 * This is (or at least began as) a direct translation of Ned Batchelder's
34 * public-domain Python implementation at
35 * http://nedbatchelder.com/code/modules/hyphenate.py
36 */
37
38using namespace lout::object;
39using namespace lout::container::typed;
40using namespace lout::misc;
41using namespace lout::unicode;
42
43namespace dw {
44
45HashTable <String, Hyphenator> *Hyphenator::hyphenators =
46 new HashTable <String, Hyphenator> (true, true);
47
48Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack)
49{
50 trie = NULL; // As long we are not sure whether a pattern file can be read.
51
52 int bufLen = strlen (patFile) + 5 + 1;
53 char *buf = new char[bufLen];
54 snprintf(buf, bufLen, "%s.trie", patFile);
55 FILE *trieF = fopen (buf, "r");
56 delete[] buf;
57
58 if (trieF) {
59 trie = new Trie ();
60 if (trie->load (trieF) != 0) {
61 delete trie;
62 trie = NULL;
63 }
64 fclose (trieF);
65 }
66
67 if (trie == NULL) {
68 TrieBuilder trieBuilder(pack);
69 FILE *patF = fopen (patFile, "r");
70 if (patF) {
71
72 while (!feof (patF)) {
73 char buf[LEN + 1];
74 char *s = fgets (buf, LEN, patF);
75 if (s && s[0] != '%') { // ignore lines starting with '%' as comment
76 // TODO Better exit with an error, when the line is too long.
77 int l = strlen (s);
78 if (s[l - 1] == '\n')
79 s[l - 1] = 0;
80 insertPattern (&trieBuilder, s);
81 }
82 }
83
84 trie = trieBuilder.createTrie ();
85 fclose (patF);
86 }
87 }
88
89 exceptions = NULL; // Again, only instantiated when needed.
90
91 FILE *excF = fopen (excFile, "r");
92 if (excF) {
93 exceptions = new HashTable <ConstString, Vector <Integer> > (true, true);
94 while (!feof (excF)) {
95 char buf[LEN + 1];
96 char *s = fgets (buf, LEN, excF);
97 if (s && s[0] != '%') { // ignore lines starting with '%' as comment
98 // TODO Better exit with an error, when the line is too long.
99 int l = strlen (s);
100 if (s[l - 1] == '\n')
101 s[l - 1] = 0;
102 insertException (s);
103 }
104 }
105 fclose (excF);
106 }
107}
108
110{
111 delete trie;
112 delete exceptions;
113}
114
116{
117 String *langString = new String (lang);
118
119 Hyphenator *hyphenator = hyphenators->get (langString);
120 if (hyphenator)
121 delete langString;
122 else {
123 int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
124 char *patFile = new char[patFileLen];
125 snprintf (patFile, patFileLen, "%s/hyphenation/%s.pat",
126 DILLO_LIBDIR, lang);
127 int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
128 char *excFile = new char[excFileLen];
129 snprintf (excFile, excFileLen, "%s/hyphenation/%s.exc",
130 DILLO_LIBDIR, lang);
131
132 //printf ("Loading hyphenation patterns for language '%s' from '%s' and "
133 // "exceptions from '%s' ...\n", lang, patFile, excFile);
134
135 hyphenator = new Hyphenator (patFile, excFile);
136 hyphenators->put (langString, hyphenator);
137 delete[] patFile;
138 delete[] excFile;
139 }
140
141 //lout::misc::StringBuffer sb;
142 //hyphenators->intoStringBuffer (&sb);
143 //printf ("%s\n", sb.getChars ());
144
145 return hyphenator;
146}
147
148void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s)
149{
150 // Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd'
151 // and a list of points [ 0, 1, 0, 3, 4 ].
152 int l = strlen (s);
153 char *chars = new char[l + 1];
154 SimpleVector<char> points (1);
155
156 // TODO numbers consisting of multiple digits?
157 // TODO Encoding: This implementation works exactly like the Python
158 // implementation, based on UTF-8. Does this always work?
159 int numChars = 0;
160 for (int i = 0; s[i]; i++) {
161 if (s[i] >= '0' && s[i] <= '9') {
162 points.setSize(numChars + 1, '0');
163 points.set(numChars, s[i]);
164 } else {
165 chars[numChars++] = s[i];
166 }
167 }
168 chars[numChars] = 0;
169
170 points.setSize(numChars + 2, '0');
171 points.set(numChars + 1, '\0');
172
173 // Insert the pattern into the tree. Each character finds a dict
174 // another level down in the tree, and leaf nodes have the list of
175 // points.
176
177 //printf("insertPattern %s\n", chars);
178
179 trieBuilder->insert (chars, points.getArray ());
180 delete[] chars;
181}
182
184{
185 Vector<Integer> *breaks = new Vector<Integer> (1, true);
186
187 int len = strlen (s);
188 for (int i = 0; i < len - 1; i++)
189 if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
190 breaks->put (new Integer (i - 2 * breaks->size()));
191
192 char *noHyphens = new char[len - 2 * breaks->size() + 1];
193 int j = 0;
194 for (int i = 0; i < len; ) {
195 if(i < len - 1 &&
196 (unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
197 i += 2;
198 else
199 noHyphens[j++] = s[i++];
200 }
201 noHyphens[j] = 0;
202
203 exceptions->put (new String (noHyphens), breaks);
204 delete[] noHyphens;
205}
206
212{
213 // Short words aren't hyphenated.
214 return (strlen (word) > 4); // TODO UTF-8?
215}
216
226{
227 return isAlpha (decodeUtf8 (s));
228}
229
234 const char *word, int *numBreaks)
235{
236 if ((trie == NULL && exceptions == NULL) || !isHyphenationCandidate (word)) {
237 *numBreaks = 0;
238 return NULL;
239 }
240
241 char *wordLc = platform->textToLower (word, strlen (word));
242
243 int start = 0;
244 SimpleVector <int> breakPos (1);
245
246 // Split the original word up, ignore anything but characters, and
247 // collect all break points, so that they fit to the original
248 // word. (The latter is what the offset in the call of
249 // hyphenateSingleWord() is for.)
250 while (true) {
251 while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
252 start = platform->nextGlyph (wordLc, start);
253
254 if (wordLc[start] == 0)
255 break;
256
257 int end = start, i = end;
258 while (wordLc[i]) {
259 if (!isCharPartOfActualWord (wordLc + i))
260 break;
261 else
262 end = i;
263 i = platform->nextGlyph (wordLc, i);
264 }
265 end = platform->nextGlyph (wordLc, end);
266
267 int nextStart;
268 if (wordLc[end]) {
269 nextStart = platform->nextGlyph (wordLc, end);
270 wordLc[end] = 0;
271 } else
272 nextStart = end;
273
274 hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
275 start = nextStart;
276 }
277
278 free (wordLc);
279
280 *numBreaks = breakPos.size ();
281 if (*numBreaks == 0)
282 return NULL;
283 else {
284 return breakPos.detachArray ();
285 }
286}
287
293 char *wordLc, int offset,
294 SimpleVector <int> *breakPos)
295{
296 // If the word is an exception, get the stored points.
297 Vector <Integer> *exceptionalBreaks;
298 ConstString key (wordLc);
299 if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
300 for (int i = 0; i < exceptionalBreaks->size(); i++) {
301 breakPos->increase ();
302 breakPos->set (breakPos->size() - 1,
303 exceptionalBreaks->get(i)->getValue() + offset);
304 }
305 return;
306 }
307
308
309 // trie == NULL means that there is no pattern file.
310 if (trie == NULL)
311 return;
312
313 char *work = new char[strlen (wordLc) + 3];
314 strcpy (work, ".");
315 strcat (work, wordLc);
316 strcat (work, ".");
317
318 int l = strlen (work);
319 SimpleVector <int> points (l + 1);
320 points.setSize (l + 1, 0);
321
322 for (int i = 0; i < l; i++) {
323 int state = trie->root;
324
325 for (int j = i; j < l && trie->validState (state); j++) {
326 const char *p = trie->getData((unsigned char) work[j], &state);
327
328 if (p) {
329 for (int k = 0; p[k]; k++)
330 points.set(i + k,
331 lout::misc::max (points.get (i + k), p[k] - '0'));
332 }
333 }
334 }
335
336 delete[] work;
337
338 // No hyphens in the first two chars or the last two.
339 // Characters are not bytes, so UTF-8 characters must be counted.
340 const char *s = nextUtf8Char (wordLc);
341 if (s != NULL && (s = nextUtf8Char (s)) != NULL) {
342 // First two characters.
343 int bytesStart = s - wordLc;
344 for (int i = 0; i < bytesStart; i++)
345 points.set (i + 1, 0);
346
347 // Last two characters: instead of iterating back from the end,
348 // we simply iterate from the start to the end and count the
349 // characters.
350
351 int lenBytes = strlen (wordLc);
352 int lenUtf8 = numUtf8Chars (wordLc);
353 int bytesEnd = 0;
354
355 s = wordLc;
356 for (int i = 0; s; s = nextUtf8Char (s), i++) {
357 if (i == lenUtf8 - 2)
358 bytesEnd = lenBytes - (s - wordLc);
359 }
360
361 for (int i = 0; i < bytesEnd; i++)
362 points.set (points.size() - 2 - i, 0);
363 }
364
365 // Examine the points to build the break point list.
366 int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2);
367 for (int i = 0; i < n; i++) {
368 if (points.get(i + 2) % 2) {
369 breakPos->increase ();
370 breakPos->set (breakPos->size() - 1, i + 1 + offset);
371 }
372 }
373}
374
376
378{
379 this->pack = pack;
380 dataList = new SimpleVector <DataEntry> (10000);
381 stateStack = new SimpleVector <StackEntry> (10);
382 tree = new SimpleVector <Trie::TrieNode> (20000);
383 dataZone = new ZoneAllocator (1024);
385}
386
388{
389 delete dataList;
390 delete stateStack;
391 delete tree;
392 delete dataZone;
393}
394
395void TrieBuilder::insert (const char *key, const char *value)
396{
397 dataList->increase ();
398 dataList->getLastRef ()->key = (unsigned char *) strdup(key);
399 dataList->getLastRef ()->value = dataZone->strdup (value);
400}
401
402int TrieBuilder::keyCompare (const void *p1, const void *p2)
403{
404 DataEntry *pd1 = (DataEntry *) p1;
405 DataEntry *pd2 = (DataEntry *) p2;
406
407 return strcmp ((char *) pd1->key, (char *) pd2->key);
408}
409
411{
412 int i, j;
413
414 if (state->count == 0)
415 return 0;
416
417 if (root) {
418 i = 0; // we reseve slot 0 for the root state
419 } else {
420 /* The bigger pack is the more slots we check and the smaller
421 * the trie will be, but CPU consumption also increases.
422 * Reasonable values for pack seemt to be between 256 and 1024.
423 */
424 i = tree->size () - pack + 2 * state->count;
425
426 if (i < 256) // reserve first 256 entries for the root state
427 i = 256;
428 }
429
430 for (;; i++) {
431 if (i + 256 > tree->size ())
432 tree->setSize (i + 256, trieNodeNull);
433
434 for (j = 1; j < 256; j++) {
435 Trie::TrieNode *tn = tree->getRef(i + j);
436
437 if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0))
438 break;
439 }
440
441 if (j == 256) // found a suitable slot
442 break;
443 }
444
445 for (int j = 1; j < 256; j++) {
446 Trie::TrieNode *tn = tree->getRef(i + j);
447
448 if (state->next[j] || state->data[j]) {
449 tn->c = j;
450 tn->next = state->next[j];
451 tn->data = state->data[j];
452 }
453 }
454
455 assert (root || i >= 256);
456 assert (!root || i == 0);
457 return i;
458}
459
460void TrieBuilder::stateStackPush (unsigned char c)
461{
462 stateStack->increase ();
463 StackEntry *e = stateStack->getLastRef ();
464 memset (e, 0, sizeof (StackEntry));
465 e->c = c;
466}
467
469{
470 int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
471 unsigned char c = stateStack->getLastRef ()->c;
472 const char *data = stateStack->getLastRef ()->data1;
473
474 stateStack->setSize (stateStack->size () - 1);
475
476 if (stateStack->size () > 0) {
477 assert (stateStack->getLastRef ()->next[c] == 0);
478 assert (stateStack->getLastRef ()->data[c] == NULL);
479 stateStack->getLastRef ()->next[c] = next;
480 stateStack->getLastRef ()->data[c] = data;
481 stateStack->getLastRef ()->count++;
482 }
483
484 return next;
485}
486
488{
489 // we need to sort the patterns as byte strings not as unicode
490 qsort (dataList->getArray (), dataList->size (),
491 sizeof (DataEntry), keyCompare);
492
493 for (int i = 0; i < dataList->size (); i++) {
494 insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
495 free (dataList->getRef (i)->key);
496 }
497
498 while (stateStack->size ())
499 stateStackPop ();
500
501 int size = tree->size ();
502 Trie *trie = new Trie(tree->detachArray(), size, true, dataZone);
503 dataZone = NULL;
504 return trie;
505}
506
507void TrieBuilder::insertSorted (unsigned char *s, const char *data)
508{
509 int len = strlen((char*)s);
510
511 for (int i = 0; i < len; i++) {
512 if (stateStack->size () > i + 1 &&
513 stateStack->getRef (i + 1)->c != s[i]) {
514 for (int j = stateStack->size () - 1; j >= i + 1; j--)
516 }
517
518 if (i + 1 >= stateStack->size ())
519 stateStackPush(s[i]);
520 }
521
522 while (stateStack->size () > len + 1)
524
525 assert (stateStack->size () == len + 1);
526 stateStack->getLastRef ()->data1 = data;
527}
528
529Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone)
530{
531 this->array = array;
532 this->size = size;
533 this->freeArray = freeArray;
534 this->dataZone = dataZone;
535}
536
538{
539 delete dataZone;
540 if (freeArray)
541 free(array);
542}
543
544void Trie::save (FILE *file)
545{
546 for (int i = 0; i < size; i++) {
547 Trie::TrieNode *tn = &array[i];
548
549 if (tn->data)
550 fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data);
551 else
552 fprintf(file, "%u, %u\n", tn->c, tn->next);
553 }
554}
555
556int Trie::load (FILE *file)
557{
558 int next, c, maxNext = 0;
559 SimpleVector <TrieNode> tree (100);
560 dataZone = new ZoneAllocator (1024);
561
562 while (!feof (file)) {
563 char buf[LEN + 1];
564 char *s = fgets (buf, LEN, file);
565
566 if (!s)
567 continue;
568
569 char data[LEN + 1];
570 int n = sscanf (s, "%d, %d, %s", &c, &next, data);
571
572 if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
573 tree.increase ();
574 tree.getLastRef ()->c = c;
575 tree.getLastRef ()->next = next;
576 if (n >= 3)
577 tree.getLastRef ()->data = dataZone->strdup (data);
578 else
579 tree.getLastRef ()->data = NULL;
580
581 if (next > maxNext)
582 maxNext = next;
583 } else {
584 goto error;
585 }
586 }
587
588 if (maxNext >= tree.size ())
589 goto error;
590
591 size = tree.size ();
592 array = tree.detachArray ();
593 freeArray = true;
594 return 0;
595
596error:
597 delete dataZone;
598 dataZone = NULL;
599 return 1;
600}
601
602} // namespace dw
void insertException(char *s)
bool isCharPartOfActualWord(char *s)
Test whether the character on which "s" points (UTF-8) is an actual part of the word.
void hyphenateSingleWord(core::Platform *platform, char *wordLc, int offset, lout::misc::SimpleVector< int > *breakPos)
Hyphenate a single word, which only consists of lowercase characters.
static lout::container::typed::HashTable< lout::object::String, Hyphenator > * hyphenators
Definition hyphenator.hh:89
static Hyphenator * getHyphenator(const char *language)
int * hyphenateWord(core::Platform *platform, const char *word, int *numBreaks)
Given a word, returns a list of the possible hyphenation points.
static bool isHyphenationCandidate(const char *word)
Simple test to avoid much costs.
lout::container::typed::HashTable< lout::object::ConstString, lout::container::typed::Vector< lout::object::Integer > > * exceptions
Definition hyphenator.hh:94
Hyphenator(const char *patFile, const char *excFile, int pack=256)
Definition hyphenator.cc:48
void insertPattern(TrieBuilder *trieBuilder, char *s)
void insertSorted(unsigned char *key, const char *value)
lout::misc::ZoneAllocator * dataZone
Definition hyphenator.hh:70
static int keyCompare(const void *p1, const void *p2)
int insertState(StackEntry *state, bool root)
TrieBuilder(int pack)
lout::misc::SimpleVector< Trie::TrieNode > * tree
Definition hyphenator.hh:67
void stateStackPush(unsigned char c)
lout::misc::SimpleVector< StackEntry > * stateStack
Definition hyphenator.hh:69
lout::misc::SimpleVector< DataEntry > * dataList
Definition hyphenator.hh:68
Trie * createTrie()
void insert(const char *key, const char *value)
static Trie::TrieNode trieNodeNull
Definition hyphenator.hh:66
bool freeArray
Definition hyphenator.hh:21
void save(FILE *file)
TrieNode * array
Definition hyphenator.hh:19
bool validState(int state)
Definition hyphenator.hh:30
lout::misc::ZoneAllocator * dataZone
Definition hyphenator.hh:22
const char * getData(unsigned char c, int *state)
Definition hyphenator.hh:31
Trie(TrieNode *array=NULL, int size=0, bool freeArray=false, lout::misc::ZoneAllocator *dataZone=NULL)
int load(FILE *file)
static const int root
Definition hyphenator.hh:29
An interface to encapsulate some platform dependencies.
Definition platform.hh:17
void put(K *key, V *value)
Definition container.hh:542
Typed version of container::untyped::Vector.
Definition container.hh:447
void put(T *newElement, int newPos=-1)
Definition container.hh:452
Simple (simpler than container::untyped::Vector and container::typed::Vector) template based vector.
Definition misc.hh:94
void setSize(int newSize)
Set the size explicitly.
Definition misc.hh:167
void increase()
Increase the vector size by one.
Definition misc.hh:160
T * getArray() const
Definition misc.hh:145
void set(int i, T t)
Store an object in the vector.
Definition misc.hh:246
T get(int i) const
Return the one element, explicitly.
Definition misc.hh:201
int size() const
Return the number of elements put into this vector.
Definition misc.hh:141
T * getLastRef() const
Return the reference of the last element (convenience method).
Definition misc.hh:225
A simple allocator optimized to handle many small chunks of memory.
Definition misc.hh:627
const char * strdup(const char *str)
Definition misc.hh:685
An object::Object wrapper for constant strings (char*).
Definition object.hh:163
An object::Object wrapper for int's.
Definition object.hh:127
An object::Object wrapper for strings (char*).
Definition object.hh:186
static void error(char *msg)
Definition dpidc.c:38
static FltkPlatform * platform
#define LEN
Definition hyphenator.cc:30
Dw is in this namespace, or sub namespaces of this one.
This namespace provides thin wrappers, implemented as C++ templates, to gain type-safety.
Definition container.hh:387
Miscellaneous stuff, which does not fit anywhere else.
Definition misc.cc:31
T min(T a, T b)
Definition misc.hh:19
T max(T a, T b)
Definition misc.hh:20
Here, some common classes (or interfaces) are defined, to standardize the access to other classes.
Definition object.cc:29
Stuff dealing with Unicode characters: UTF-8, character classes etc.
Definition unicode.cc:28
int numUtf8Chars(const char *s)
Definition unicode.cc:162
int decodeUtf8(const char *s)
Definition unicode.cc:73
bool isAlpha(int ch)
Returns whether a given unicode character is an alphabetic character.
Definition unicode.cc:68
const char * nextUtf8Char(const char *s)
Definition unicode.cc:110
const char * data[256]
Definition hyphenator.hh:56
unsigned char c
Definition hyphenator.hh:13
const char * data
Definition hyphenator.hh:15