Dillo v3.1.1-98-g318d1f14
Loading...
Searching...
No Matches
hyphenator.cc
Go to the documentation of this file.
1/*
2 * Dillo Widget
3 *
4 * Copyright 2012-2013 Sebastian Geerken <sgeerken@dillo.org>,
5 * Johannes Hofmann <Johannes.Hofmann@gmx.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21
22#include "hyphenator.hh"
23
24#include "dlib/dlib.h"
25#include "../lout/misc.hh"
26#include "../lout/unicode.hh"
27#include <limits.h>
28#include <stdio.h>
29#include <string.h>
30
31#define LEN 1000
32
33/*
34 * This is (or at least began as) a direct translation of Ned Batchelder's
35 * public-domain Python implementation at
36 * http://nedbatchelder.com/code/modules/hyphenate.py
37 */
38
39using namespace lout::object;
40using namespace lout::container::typed;
41using namespace lout::misc;
42using namespace lout::unicode;
43
44namespace dw {
45
46HashTable <String, Hyphenator> *Hyphenator::hyphenators =
47 new HashTable <String, Hyphenator> (true, true);
48
49Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack)
50{
51 trie = NULL; // As long we are not sure whether a pattern file can be read.
52
53 int bufLen = strlen (patFile) + 5 + 1;
54 char *buf = new char[bufLen];
55 snprintf(buf, bufLen, "%s.trie", patFile);
56 FILE *trieF = fopen (buf, "r");
57 delete[] buf;
58
59 if (trieF) {
60 trie = new Trie ();
61 if (trie->load (trieF) != 0) {
62 delete trie;
63 trie = NULL;
64 }
65 fclose (trieF);
66 }
67
68 if (trie == NULL) {
69 TrieBuilder trieBuilder(pack);
70 FILE *patF = fopen (patFile, "r");
71 if (patF) {
72
73 while (!feof (patF)) {
74 char buf[LEN + 1];
75 char *s = fgets (buf, LEN, patF);
76 if (s && s[0] != '%') { // ignore lines starting with '%' as comment
77 // TODO Better exit with an error, when the line is too long.
78 int l = strlen (s);
79 if (s[l - 1] == '\n')
80 s[l - 1] = 0;
81 insertPattern (&trieBuilder, s);
82 }
83 }
84
85 trie = trieBuilder.createTrie ();
86 fclose (patF);
87 }
88 }
89
90 exceptions = NULL; // Again, only instantiated when needed.
91
92 FILE *excF = fopen (excFile, "r");
93 if (excF) {
94 exceptions = new HashTable <ConstString, Vector <Integer> > (true, true);
95 while (!feof (excF)) {
96 char buf[LEN + 1];
97 char *s = fgets (buf, LEN, excF);
98 if (s && s[0] != '%') { // ignore lines starting with '%' as comment
99 // TODO Better exit with an error, when the line is too long.
100 int l = strlen (s);
101 if (s[l - 1] == '\n')
102 s[l - 1] = 0;
103 insertException (s);
104 }
105 }
106 fclose (excF);
107 }
108}
109
111{
112 delete trie;
113 delete exceptions;
114}
115
117{
118 String *langString = new String (lang);
119
120 Hyphenator *hyphenator = hyphenators->get (langString);
121 if (hyphenator)
122 delete langString;
123 else {
124 int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
125 char *patFile = new char[patFileLen];
126 snprintf (patFile, patFileLen, "%s/hyphenation/%s.pat",
127 DILLO_LIBDIR, lang);
128 int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
129 char *excFile = new char[excFileLen];
130 snprintf (excFile, excFileLen, "%s/hyphenation/%s.exc",
131 DILLO_LIBDIR, lang);
132
133 //printf ("Loading hyphenation patterns for language '%s' from '%s' and "
134 // "exceptions from '%s' ...\n", lang, patFile, excFile);
135
136 hyphenator = new Hyphenator (patFile, excFile);
137 hyphenators->put (langString, hyphenator);
138 delete[] patFile;
139 delete[] excFile;
140 }
141
142 //lout::misc::StringBuffer sb;
143 //hyphenators->intoStringBuffer (&sb);
144 //printf ("%s\n", sb.getChars ());
145
146 return hyphenator;
147}
148
149void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s)
150{
151 // Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd'
152 // and a list of points [ 0, 1, 0, 3, 4 ].
153 int l = strlen (s);
154 char *chars = new char[l + 1];
155 SimpleVector<char> points (1);
156
157 // TODO numbers consisting of multiple digits?
158 // TODO Encoding: This implementation works exactly like the Python
159 // implementation, based on UTF-8. Does this always work?
160 int numChars = 0;
161 for (int i = 0; s[i]; i++) {
162 if (s[i] >= '0' && s[i] <= '9') {
163 points.setSize(numChars + 1, '0');
164 points.set(numChars, s[i]);
165 } else {
166 chars[numChars++] = s[i];
167 }
168 }
169 chars[numChars] = 0;
170
171 points.setSize(numChars + 2, '0');
172 points.set(numChars + 1, '\0');
173
174 // Insert the pattern into the tree. Each character finds a dict
175 // another level down in the tree, and leaf nodes have the list of
176 // points.
177
178 //printf("insertPattern %s\n", chars);
179
180 trieBuilder->insert (chars, points.getArray ());
181 delete[] chars;
182}
183
185{
186 Vector<Integer> *breaks = new Vector<Integer> (1, true);
187
188 int len = strlen (s);
189 for (int i = 0; i < len - 1; i++)
190 if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
191 breaks->put (new Integer (i - 2 * breaks->size()));
192
193 char *noHyphens = new char[len - 2 * breaks->size() + 1];
194 int j = 0;
195 for (int i = 0; i < len; ) {
196 if(i < len - 1 &&
197 (unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
198 i += 2;
199 else
200 noHyphens[j++] = s[i++];
201 }
202 noHyphens[j] = 0;
203
204 exceptions->put (new String (noHyphens), breaks);
205 delete[] noHyphens;
206}
207
213{
214 // Short words aren't hyphenated.
215 return (strlen (word) > 4); // TODO UTF-8?
216}
217
227{
228 return isAlpha (decodeUtf8 (s));
229}
230
235 const char *word, int *numBreaks)
236{
237 if ((trie == NULL && exceptions == NULL) || !isHyphenationCandidate (word)) {
238 *numBreaks = 0;
239 return NULL;
240 }
241
242 char *wordLc = platform->textToLower (word, strlen (word));
243
244 int start = 0;
245 SimpleVector <int> breakPos (1);
246
247 // Split the original word up, ignore anything but characters, and
248 // collect all break points, so that they fit to the original
249 // word. (The latter is what the offset in the call of
250 // hyphenateSingleWord() is for.)
251 while (true) {
252 while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
253 start = platform->nextGlyph (wordLc, start);
254
255 if (wordLc[start] == 0)
256 break;
257
258 int end = start, i = end;
259 while (wordLc[i]) {
260 if (!isCharPartOfActualWord (wordLc + i))
261 break;
262 else
263 end = i;
264 i = platform->nextGlyph (wordLc, i);
265 }
266 end = platform->nextGlyph (wordLc, end);
267
268 int nextStart;
269 if (wordLc[end]) {
270 nextStart = platform->nextGlyph (wordLc, end);
271 wordLc[end] = 0;
272 } else
273 nextStart = end;
274
275 hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
276 start = nextStart;
277 }
278
279 free (wordLc);
280
281 *numBreaks = breakPos.size ();
282 if (*numBreaks == 0)
283 return NULL;
284 else {
285 return breakPos.detachArray ();
286 }
287}
288
294 char *wordLc, int offset,
295 SimpleVector <int> *breakPos)
296{
297 // If the word is an exception, get the stored points.
298 Vector <Integer> *exceptionalBreaks;
299 ConstString key (wordLc);
300 if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
301 for (int i = 0; i < exceptionalBreaks->size(); i++) {
302 breakPos->increase ();
303 breakPos->set (breakPos->size() - 1,
304 exceptionalBreaks->get(i)->getValue() + offset);
305 }
306 return;
307 }
308
309
310 // trie == NULL means that there is no pattern file.
311 if (trie == NULL)
312 return;
313
314 char *work = new char[strlen (wordLc) + 3];
315 strcpy (work, ".");
316 strcat (work, wordLc);
317 strcat (work, ".");
318
319 int l = strlen (work);
320 SimpleVector <int> points (l + 1);
321 points.setSize (l + 1, 0);
322
323 for (int i = 0; i < l; i++) {
324 int state = trie->root;
325
326 for (int j = i; j < l && trie->validState (state); j++) {
327 const char *p = trie->getData((unsigned char) work[j], &state);
328
329 if (p) {
330 for (int k = 0; p[k]; k++)
331 points.set(i + k,
332 lout::misc::max (points.get (i + k), p[k] - '0'));
333 }
334 }
335 }
336
337 delete[] work;
338
339 // No hyphens in the first two chars or the last two.
340 // Characters are not bytes, so UTF-8 characters must be counted.
341 const char *s = nextUtf8Char (wordLc);
342 if (s != NULL && (s = nextUtf8Char (s)) != NULL) {
343 // First two characters.
344 int bytesStart = s - wordLc;
345 for (int i = 0; i < bytesStart; i++)
346 points.set (i + 1, 0);
347
348 // Last two characters: instead of iterating back from the end,
349 // we simply iterate from the start to the end and count the
350 // characters.
351
352 int lenBytes = strlen (wordLc);
353 int lenUtf8 = numUtf8Chars (wordLc);
354 int bytesEnd = 0;
355
356 s = wordLc;
357 for (int i = 0; s; s = nextUtf8Char (s), i++) {
358 if (i == lenUtf8 - 2)
359 bytesEnd = lenBytes - (s - wordLc);
360 }
361
362 for (int i = 0; i < bytesEnd; i++)
363 points.set (points.size() - 2 - i, 0);
364 }
365
366 // Examine the points to build the break point list.
367 int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2);
368 for (int i = 0; i < n; i++) {
369 if (points.get(i + 2) % 2) {
370 breakPos->increase ();
371 breakPos->set (breakPos->size() - 1, i + 1 + offset);
372 }
373 }
374}
375
377
379{
380 this->pack = pack;
381 dataList = new SimpleVector <DataEntry> (10000);
382 stateStack = new SimpleVector <StackEntry> (10);
383 tree = new SimpleVector <Trie::TrieNode> (20000);
384 dataZone = new ZoneAllocator (1024);
386}
387
389{
390 delete dataList;
391 delete stateStack;
392 delete tree;
393 delete dataZone;
394}
395
396void TrieBuilder::insert (const char *key, const char *value)
397{
398 dataList->increase ();
399 dataList->getLastRef ()->key = (unsigned char *) dStrdup(key);
400 dataList->getLastRef ()->value = dataZone->strdup (value);
401}
402
403int TrieBuilder::keyCompare (const void *p1, const void *p2)
404{
405 DataEntry *pd1 = (DataEntry *) p1;
406 DataEntry *pd2 = (DataEntry *) p2;
407
408 return strcmp ((char *) pd1->key, (char *) pd2->key);
409}
410
412{
413 int i, j;
414
415 if (state->count == 0)
416 return 0;
417
418 if (root) {
419 i = 0; // we reseve slot 0 for the root state
420 } else {
421 /* The bigger pack is the more slots we check and the smaller
422 * the trie will be, but CPU consumption also increases.
423 * Reasonable values for pack seemt to be between 256 and 1024.
424 */
425 i = tree->size () - pack + 2 * state->count;
426
427 if (i < 256) // reserve first 256 entries for the root state
428 i = 256;
429 }
430
431 for (;; i++) {
432 if (i + 256 > tree->size ())
433 tree->setSize (i + 256, trieNodeNull);
434
435 for (j = 1; j < 256; j++) {
436 Trie::TrieNode *tn = tree->getRef(i + j);
437
438 if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0))
439 break;
440 }
441
442 if (j == 256) // found a suitable slot
443 break;
444 }
445
446 for (int j = 1; j < 256; j++) {
447 Trie::TrieNode *tn = tree->getRef(i + j);
448
449 if (state->next[j] || state->data[j]) {
450 tn->c = j;
451 tn->next = state->next[j];
452 tn->data = state->data[j];
453 }
454 }
455
456 assert (root || i >= 256);
457 assert (!root || i == 0);
458 return i;
459}
460
461void TrieBuilder::stateStackPush (unsigned char c)
462{
463 stateStack->increase ();
464 StackEntry *e = stateStack->getLastRef ();
465 memset (e, 0, sizeof (StackEntry));
466 e->c = c;
467}
468
470{
471 int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
472 unsigned char c = stateStack->getLastRef ()->c;
473 const char *data = stateStack->getLastRef ()->data1;
474
475 stateStack->setSize (stateStack->size () - 1);
476
477 if (stateStack->size () > 0) {
478 assert (stateStack->getLastRef ()->next[c] == 0);
479 assert (stateStack->getLastRef ()->data[c] == NULL);
480 stateStack->getLastRef ()->next[c] = next;
481 stateStack->getLastRef ()->data[c] = data;
482 stateStack->getLastRef ()->count++;
483 }
484
485 return next;
486}
487
489{
490 // we need to sort the patterns as byte strings not as unicode
491 qsort (dataList->getArray (), dataList->size (),
492 sizeof (DataEntry), keyCompare);
493
494 for (int i = 0; i < dataList->size (); i++) {
495 insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
496 free (dataList->getRef (i)->key);
497 }
498
499 while (stateStack->size ())
500 stateStackPop ();
501
502 int size = tree->size ();
503 Trie *trie = new Trie(tree->detachArray(), size, true, dataZone);
504 dataZone = NULL;
505 return trie;
506}
507
508void TrieBuilder::insertSorted (unsigned char *s, const char *data)
509{
510 int len = strlen((char*)s);
511
512 for (int i = 0; i < len; i++) {
513 if (stateStack->size () > i + 1 &&
514 stateStack->getRef (i + 1)->c != s[i]) {
515 for (int j = stateStack->size () - 1; j >= i + 1; j--)
517 }
518
519 if (i + 1 >= stateStack->size ())
520 stateStackPush(s[i]);
521 }
522
523 while (stateStack->size () > len + 1)
525
526 assert (stateStack->size () == len + 1);
527 stateStack->getLastRef ()->data1 = data;
528}
529
530Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone)
531{
532 this->array = array;
533 this->size = size;
534 this->freeArray = freeArray;
535 this->dataZone = dataZone;
536}
537
539{
540 delete dataZone;
541 if (freeArray)
542 free(array);
543}
544
545void Trie::save (FILE *file)
546{
547 for (int i = 0; i < size; i++) {
548 Trie::TrieNode *tn = &array[i];
549
550 if (tn->data)
551 fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data);
552 else
553 fprintf(file, "%u, %u\n", tn->c, tn->next);
554 }
555}
556
557int Trie::load (FILE *file)
558{
559 int next, c, maxNext = 0;
560 SimpleVector <TrieNode> tree (100);
561 dataZone = new ZoneAllocator (1024);
562
563 while (!feof (file)) {
564 char buf[LEN + 1];
565 char *s = fgets (buf, LEN, file);
566
567 if (!s)
568 continue;
569
570 char data[LEN + 1];
571 int n = sscanf (s, "%d, %d, %s", &c, &next, data);
572
573 if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
574 tree.increase ();
575 tree.getLastRef ()->c = c;
576 tree.getLastRef ()->next = next;
577 if (n >= 3)
578 tree.getLastRef ()->data = dataZone->strdup (data);
579 else
580 tree.getLastRef ()->data = NULL;
581
582 if (next > maxNext)
583 maxNext = next;
584 } else {
585 goto error;
586 }
587 }
588
589 if (maxNext >= tree.size ())
590 goto error;
591
592 size = tree.size ();
593 array = tree.detachArray ();
594 freeArray = true;
595 return 0;
596
597error:
598 delete dataZone;
599 dataZone = NULL;
600 return 1;
601}
602
603} // namespace dw
void insertException(char *s)
bool isCharPartOfActualWord(char *s)
Test whether the character on which "s" points (UTF-8) is an actual part of the word.
void hyphenateSingleWord(core::Platform *platform, char *wordLc, int offset, lout::misc::SimpleVector< int > *breakPos)
Hyphenate a single word, which only consists of lowercase characters.
static lout::container::typed::HashTable< lout::object::String, Hyphenator > * hyphenators
Definition hyphenator.hh:89
static Hyphenator * getHyphenator(const char *language)
int * hyphenateWord(core::Platform *platform, const char *word, int *numBreaks)
Given a word, returns a list of the possible hyphenation points.
static bool isHyphenationCandidate(const char *word)
Simple test to avoid much costs.
lout::container::typed::HashTable< lout::object::ConstString, lout::container::typed::Vector< lout::object::Integer > > * exceptions
Definition hyphenator.hh:94
Hyphenator(const char *patFile, const char *excFile, int pack=256)
Definition hyphenator.cc:49
void insertPattern(TrieBuilder *trieBuilder, char *s)
void insertSorted(unsigned char *key, const char *value)
lout::misc::ZoneAllocator * dataZone
Definition hyphenator.hh:70
static int keyCompare(const void *p1, const void *p2)
int insertState(StackEntry *state, bool root)
TrieBuilder(int pack)
lout::misc::SimpleVector< Trie::TrieNode > * tree
Definition hyphenator.hh:67
void stateStackPush(unsigned char c)
lout::misc::SimpleVector< StackEntry > * stateStack
Definition hyphenator.hh:69
lout::misc::SimpleVector< DataEntry > * dataList
Definition hyphenator.hh:68
Trie * createTrie()
void insert(const char *key, const char *value)
static Trie::TrieNode trieNodeNull
Definition hyphenator.hh:66
bool freeArray
Definition hyphenator.hh:21
void save(FILE *file)
TrieNode * array
Definition hyphenator.hh:19
bool validState(int state)
Definition hyphenator.hh:30
lout::misc::ZoneAllocator * dataZone
Definition hyphenator.hh:22
const char * getData(unsigned char c, int *state)
Definition hyphenator.hh:31
Trie(TrieNode *array=NULL, int size=0, bool freeArray=false, lout::misc::ZoneAllocator *dataZone=NULL)
int load(FILE *file)
static const int root
Definition hyphenator.hh:29
An interface to encapsulate some platform dependencies.
Definition platform.hh:17
void put(K *key, V *value)
Definition container.hh:542
Typed version of container::untyped::Vector.
Definition container.hh:447
void put(T *newElement, int newPos=-1)
Definition container.hh:452
Simple (simpler than container::untyped::Vector and container::typed::Vector) template based vector.
Definition misc.hh:115
void setSize(int newSize)
Set the size explicitly.
Definition misc.hh:188
void increase()
Increase the vector size by one.
Definition misc.hh:181
T * getArray() const
Definition misc.hh:166
void set(int i, T t)
Store an object in the vector.
Definition misc.hh:267
T get(int i) const
Return the one element, explicitly.
Definition misc.hh:222
int size() const
Return the number of elements put into this vector.
Definition misc.hh:162
T * getLastRef() const
Return the reference of the last element (convenience method).
Definition misc.hh:246
A simple allocator optimized to handle many small chunks of memory.
Definition misc.hh:631
const char * strdup(const char *str)
Definition misc.hh:689
An object::Object wrapper for constant strings (char*).
Definition object.hh:163
An object::Object wrapper for int's.
Definition object.hh:127
An object::Object wrapper for strings (char*).
Definition object.hh:186
char * dStrdup(const char *s)
Definition dlib.c:77
static void error(char *msg)
Definition dpidc.c:39
static FltkPlatform * platform
#define LEN
Definition hyphenator.cc:31
Dw is in this namespace, or sub namespaces of this one.
This namespace provides thin wrappers, implemented as C++ templates, to gain type-safety.
Definition container.hh:387
Miscellaneous stuff, which does not fit anywhere else.
Definition misc.cc:31
T min(T a, T b)
Definition misc.hh:40
T max(T a, T b)
Definition misc.hh:41
Here, some common classes (or interfaces) are defined, to standardize the access to other classes.
Definition object.cc:30
Stuff dealing with Unicode characters: UTF-8, character classes etc.
Definition unicode.cc:28
int numUtf8Chars(const char *s)
Definition unicode.cc:162
int decodeUtf8(const char *s)
Definition unicode.cc:73
bool isAlpha(int ch)
Returns whether a given unicode character is an alphabetic character.
Definition unicode.cc:68
const char * nextUtf8Char(const char *s)
Definition unicode.cc:110
const char * data[256]
Definition hyphenator.hh:56
unsigned char c
Definition hyphenator.hh:13
const char * data
Definition hyphenator.hh:15