25#include "../lout/misc.hh"
26#include "../lout/unicode.hh"
47 new HashTable <String, Hyphenator> (
true,
true);
53 int bufLen = strlen (patFile) + 5 + 1;
54 char *buf =
new char[bufLen];
55 snprintf(buf, bufLen,
"%s.trie", patFile);
56 FILE *trieF = fopen (buf,
"r");
70 FILE *patF = fopen (patFile,
"r");
73 while (!feof (patF)) {
75 char *s = fgets (buf,
LEN, patF);
76 if (s && s[0] !=
'%') {
92 FILE *excF = fopen (excFile,
"r");
94 exceptions =
new HashTable <ConstString, Vector <Integer> > (
true,
true);
95 while (!feof (excF)) {
97 char *s = fgets (buf,
LEN, excF);
98 if (s && s[0] !=
'%') {
101 if (s[l - 1] ==
'\n')
124 int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
125 char *patFile =
new char[patFileLen];
126 snprintf (patFile, patFileLen,
"%s/hyphenation/%s.pat",
128 int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
129 char *excFile =
new char[excFileLen];
130 snprintf (excFile, excFileLen,
"%s/hyphenation/%s.exc",
136 hyphenator =
new Hyphenator (patFile, excFile);
154 char *chars =
new char[l + 1];
161 for (
int i = 0; s[i]; i++) {
162 if (s[i] >=
'0' && s[i] <=
'9') {
163 points.
setSize(numChars + 1,
'0');
164 points.
set(numChars, s[i]);
166 chars[numChars++] = s[i];
171 points.
setSize(numChars + 2,
'0');
172 points.
set(numChars + 1,
'\0');
188 int len = strlen (s);
189 for (
int i = 0; i < len - 1; i++)
190 if((
unsigned char)s[i] == 0xc2 && (
unsigned char)s[i + 1] == 0xad)
193 char *noHyphens =
new char[len - 2 * breaks->
size() + 1];
195 for (
int i = 0; i < len; ) {
197 (
unsigned char)s[i] == 0xc2 && (
unsigned char)s[i + 1] == 0xad)
200 noHyphens[j++] = s[i++];
215 return (strlen (word) > 4);
235 const char *word,
int *numBreaks)
242 char *wordLc =
platform->textToLower (word, strlen (word));
245 SimpleVector <int> breakPos (1);
253 start =
platform->nextGlyph (wordLc, start);
255 if (wordLc[start] == 0)
258 int end = start, i = end;
264 i =
platform->nextGlyph (wordLc, i);
266 end =
platform->nextGlyph (wordLc, end);
270 nextStart =
platform->nextGlyph (wordLc, end);
281 *numBreaks = breakPos.
size ();
294 char *wordLc,
int offset,
295 SimpleVector <int> *breakPos)
298 Vector <Integer> *exceptionalBreaks;
301 for (
int i = 0; i < exceptionalBreaks->
size(); i++) {
303 breakPos->
set (breakPos->
size() - 1,
304 exceptionalBreaks->
get(i)->getValue() + offset);
314 char *work =
new char[strlen (wordLc) + 3];
316 strcat (work, wordLc);
319 int l = strlen (work);
320 SimpleVector <int> points (l + 1);
323 for (
int i = 0; i < l; i++) {
327 const char *p =
trie->
getData((
unsigned char) work[j], &state);
330 for (
int k = 0; p[k]; k++)
344 int bytesStart = s - wordLc;
345 for (
int i = 0; i < bytesStart; i++)
346 points.
set (i + 1, 0);
352 int lenBytes = strlen (wordLc);
358 if (i == lenUtf8 - 2)
359 bytesEnd = lenBytes - (s - wordLc);
362 for (
int i = 0; i < bytesEnd; i++)
363 points.
set (points.
size() - 2 - i, 0);
368 for (
int i = 0; i < n; i++) {
369 if (points.
get(i + 2) % 2) {
371 breakPos->
set (breakPos->
size() - 1, i + 1 + offset);
381 dataList =
new SimpleVector <DataEntry> (10000);
382 stateStack =
new SimpleVector <StackEntry> (10);
383 tree =
new SimpleVector <Trie::TrieNode> (20000);
408 return strcmp ((
char *) pd1->
key, (
char *) pd2->
key);
415 if (state->
count == 0)
432 if (i + 256 >
tree->size ())
435 for (j = 1; j < 256; j++) {
438 if (tn->
c == j || ((state->
next[j] || state->
data[j]) && tn->
c != 0))
446 for (
int j = 1; j < 256; j++) {
449 if (state->
next[j] || state->
data[j]) {
456 assert (root || i >= 256);
457 assert (!root || i == 0);
472 unsigned char c =
stateStack->getLastRef ()->c;
473 const char *data =
stateStack->getLastRef ()->data1;
478 assert (
stateStack->getLastRef ()->next[c] == 0);
479 assert (
stateStack->getLastRef ()->data[c] == NULL);
494 for (
int i = 0; i <
dataList->size (); i++) {
502 int size =
tree->size ();
510 int len = strlen((
char*)s);
512 for (
int i = 0; i < len; i++) {
515 for (
int j =
stateStack->size () - 1; j >= i + 1; j--)
547 for (
int i = 0; i <
size; i++) {
551 fprintf(file,
"%u, %u, %s\n", tn->
c, tn->
next, tn->
data);
553 fprintf(file,
"%u, %u\n", tn->
c, tn->
next);
559 int next, c, maxNext = 0;
560 SimpleVector <TrieNode> tree (100);
563 while (!feof (file)) {
565 char *s = fgets (buf,
LEN, file);
571 int n = sscanf (s,
"%d, %d, %s", &c, &next, data);
573 if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
589 if (maxNext >= tree.
size ())
void insertException(char *s)
bool isCharPartOfActualWord(char *s)
Test whether the UTF-8 character that "s" points to is actually part of the word.
void hyphenateSingleWord(core::Platform *platform, char *wordLc, int offset, lout::misc::SimpleVector< int > *breakPos)
Hyphenate a single word, which only consists of lowercase characters.
static lout::container::typed::HashTable< lout::object::String, Hyphenator > * hyphenators
static Hyphenator * getHyphenator(const char *language)
int * hyphenateWord(core::Platform *platform, const char *word, int *numBreaks)
Given a word, returns a list of the possible hyphenation points.
static bool isHyphenationCandidate(const char *word)
Simple test to avoid unnecessary costs.
lout::container::typed::HashTable< lout::object::ConstString, lout::container::typed::Vector< lout::object::Integer > > * exceptions
Hyphenator(const char *patFile, const char *excFile, int pack=256)
void insertPattern(TrieBuilder *trieBuilder, char *s)
void insertSorted(unsigned char *key, const char *value)
lout::misc::ZoneAllocator * dataZone
static int keyCompare(const void *p1, const void *p2)
int insertState(StackEntry *state, bool root)
lout::misc::SimpleVector< Trie::TrieNode > * tree
void stateStackPush(unsigned char c)
lout::misc::SimpleVector< StackEntry > * stateStack
lout::misc::SimpleVector< DataEntry > * dataList
void insert(const char *key, const char *value)
static Trie::TrieNode trieNodeNull
bool validState(int state)
lout::misc::ZoneAllocator * dataZone
const char * getData(unsigned char c, int *state)
Trie(TrieNode *array=NULL, int size=0, bool freeArray=false, lout::misc::ZoneAllocator *dataZone=NULL)
void put(K *key, V *value)
Typed version of container::untyped::Vector.
void put(T *newElement, int newPos=-1)
Simple (simpler than container::untyped::Vector and container::typed::Vector) template-based vector.
void setSize(int newSize)
Set the size explicitly.
void increase()
Increase the vector size by one.
void set(int i, T t)
Store an object in the vector.
T get(int i) const
Return the one element, explicitly.
int size() const
Return the number of elements put into this vector.
T * getLastRef() const
Return a reference to the last element (convenience method).
A simple allocator optimized to handle many small chunks of memory.
const char * strdup(const char *str)
An object::Object wrapper for constant strings (char*).
An object::Object wrapper for ints.
An object::Object wrapper for strings (char*).
char * dStrdup(const char *s)
static void error(char *msg)
static FltkPlatform * platform
Dw is in this namespace, or sub namespaces of this one.
This namespace provides thin wrappers, implemented as C++ templates, to gain type-safety.
Miscellaneous stuff that does not fit anywhere else.
Here, some common classes (or interfaces) are defined, to standardize the access to other classes.
Stuff dealing with Unicode characters: UTF-8, character classes etc.
int numUtf8Chars(const char *s)
int decodeUtf8(const char *s)
bool isAlpha(int ch)
Returns whether a given unicode character is an alphabetic character.
const char * nextUtf8Char(const char *s)