#include "d_size.h"

Include dependency graph for utf8.hh:

This graph shows which files directly or indirectly include this file:

Functions
uint_t	a_Utf8_end_of_char (const char *str, uint_t i)
	Return index of the last byte of the UTF-8-encoded character that str + i points to or into.

uint_t	a_Utf8_decode (const char *str, const char *end, int *len)
	Decode a single UTF-8-encoded character starting at str.

int	a_Utf8_encode (unsigned int ucs, char *buf)
	Write UTF-8 encoding of ucs into buf and return number of bytes written.

int	a_Utf8_test (const char *src, unsigned int srclen)
	Examine first srclen bytes of src.

bool_t	a_Utf8_ideographic (const char *s, const char *end, int *len)
	Does s point to a UTF-8-encoded ideographic character?

bool_t	a_Utf8_combining_char (int unicode)

int	a_Utf8_char_count (const char *str, int len)

Variables
static const char	utf8_replacement_char [] = "\xEF\xBF\xBD"
	Unicode replacement character U+FFFD.

static const char	utf8_zero_width_space [] = "\xE2\x80\x8B"
	Unicode zero width space U+200B.

Function Documentation

◆ a_Utf8_char_count()

int a_Utf8_char_count	(	const char *	str,
		int	len
	)

Definition at line 104 of file utf8.cc.

Referenced by Keys::parseKey().

◆ a_Utf8_combining_char()

bool_t a_Utf8_combining_char ( int unicode )

Definition at line 96 of file utf8.cc.

Referenced by a_Misc_expand_tabs().

◆ a_Utf8_decode()

uint_t a_Utf8_decode	(	const char *	str,
		const char *	end,
		int *	len
	)

Decode a single UTF-8-encoded character starting at str.

The resulting Unicode value (in the range 0-0x10ffff) is returned, and len is set to the number of bytes in the UTF-8 encoding. Note that utf8decode(), if given non-UTF-8 data, will interpret it as ISO-8859-1 or CP1252 if possible.

Definition at line 46 of file utf8.cc.

Referenced by a_Misc_expand_tabs(), a_Utf8_ideographic(), Keys::getKeyCmd(), and Keys::parseKey().

◆ a_Utf8_encode()

int a_Utf8_encode	(	unsigned int	ucs,
		char *	buf
	)

Write UTF-8 encoding of ucs into buf and return number of bytes written.

Definition at line 54 of file utf8.cc.

Referenced by Html_parse_numeric_charref().

◆ a_Utf8_end_of_char()

uint_t a_Utf8_end_of_char	(	const char *	str,
		uint_t	i
	)

Return index of the last byte of the UTF-8-encoded character that str + i points to or into.

Definition at line 23 of file utf8.cc.

Referenced by a_Misc_get_content_type_from_data(), a_UIcmd_set_page_title(), and a_Utf8_ideographic().

◆ a_Utf8_ideographic()

bool_t a_Utf8_ideographic	(	const char *	s,
		const char *	end,
		int *	len
	)

Does s point to a UTF-8-encoded ideographic character?

This is based on http://unicode.org/reports/tr14/#ID plus some guesses for what might make the most sense for Dillo. Surprisingly, they include Hangul Compatibility Jamo, but they're the experts, so I'll follow along.

Definition at line 76 of file utf8.cc.

References a_Utf8_decode(), a_Utf8_end_of_char(), FALSE, and TRUE.

Referenced by Html_process_word().

◆ a_Utf8_test()

int a_Utf8_test	(	const char *	src,
		unsigned int	srclen
	)

Examine first srclen bytes of src.

Return 0 if not legal UTF-8, 1 if all ASCII, 2 if all below 0x800, 3 if all below 0x10000, and 4 otherwise.

Definition at line 64 of file utf8.cc.

Referenced by a_Misc_get_content_type_from_data().

Variable Documentation

◆ utf8_replacement_char

const char utf8_replacement_char[] = "\xEF\xBF\xBD"

static

Unicode replacement character U+FFFD.

"used to replace an incoming character whose value is unknown or otherwise unrepresentable in Unicode"

Definition at line 16 of file utf8.hh.

Referenced by Decode_charset().

◆ utf8_zero_width_space

const char utf8_zero_width_space[] = "\xE2\x80\x8B"