Dillo v3.1.1-111-gd4f56d0d
Loading...
Searching...
No Matches
misc.c
Go to the documentation of this file.
1/*
2 * File: misc.c
3 *
4 * Copyright (C) 2000-2007 Jorge Arellano Cid <jcid@dillo.org>,
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <stdio.h>
13#include <stdlib.h>
14#include <string.h>
15#include <ctype.h>
16#include <assert.h>
17
18#include "utf8.hh"
19#include "msg.h"
20#include "misc.h"
21
26char *a_Misc_escape_chars(const char *str, const char *esc_set)
27{
28 static const char *const hex = "0123456789ABCDEF";
29 char *p = NULL;
30 Dstr *dstr;
31 int i;
32
33 dstr = dStr_sized_new(64);
34 for (i = 0; str[i]; ++i) {
35 if (str[i] <= 0x1F || str[i] == 0x7F || strchr(esc_set, str[i])) {
36 dStr_append_c(dstr, '%');
37 dStr_append_c(dstr, hex[(str[i] >> 4) & 15]);
38 dStr_append_c(dstr, hex[str[i] & 15]);
39 } else {
40 dStr_append_c(dstr, str[i]);
41 }
42 }
43 p = dstr->str;
44 dStr_free(dstr, FALSE);
45
46 return p;
47}
48
49#define TAB_SIZE 8
53int
54a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
55{
56 int j, pos = 0, written = 0, old_pos, char_len;
57 uint_t code;
58 static const int combining_char_space = 32;
59
60 while (*start < end && written < buflen - TAB_SIZE - combining_char_space) {
61 code = a_Utf8_decode(*start, end, &char_len);
62
63 if (code == '\t') {
64 /* Fill with whitespaces until the next tab. */
65 old_pos = pos;
66 pos += TAB_SIZE - (pos % TAB_SIZE);
67 for (j = old_pos; j < pos; j++)
68 buf[written++] = ' ';
69 } else {
70 assert(char_len <= 4);
71 for (j = 0; j < char_len; j++)
72 buf[written++] = (*start)[j];
73 pos++;
74 }
75
76 *start += char_len;
77 }
78
79 /* If following chars are combining chars (e.g. accents) add them to the
80 * buffer. We have reserved combining_char_space bytes for this.
81 * If there should be more combining chars, we split nevertheless.
82 */
83 while (*start < end && written < buflen - 4) {
84 code = a_Utf8_decode(*start, end, &char_len);
85
86 if (! a_Utf8_combining_char(code))
87 break;
88
89 assert(char_len <= 4);
90 for (j = 0; j < char_len; j++)
91 buf[written++] = (*start)[j];
92
93 *start += char_len;
94 }
95
96 return written;
97}
98
/* TODO: could use dStr ADT! */
/* A MIME type name together with its length in bytes (no terminator). */
typedef struct {
   const char *str;
   int len;
} ContentType_t;

/* Table entry whose length is taken from the string literal itself. */
#define MIME_ENTRY(lit) { lit, (int) (sizeof(lit) - 1) }

/*
 * MIME types known to this file. The table is indexed with
 * DetectedContentType values and scanned up to the { NULL, 0 } entry,
 * so both the order of the entries and the final sentinel matter.
 */
static const ContentType_t MimeTypes[] = {
   MIME_ENTRY("application/octet-stream"),
   MIME_ENTRY("application/xhtml+xml"),
   MIME_ENTRY("text/html"),
   MIME_ENTRY("text/plain"),
   MIME_ENTRY("image/gif"),
   MIME_ENTRY("image/png"),
   MIME_ENTRY("image/jpeg"),
   { NULL, 0 }
};

#undef MIME_ENTRY
115
125
136int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
137{
138 size_t i, non_ascci, non_ascci_text, bin_chars;
139 char *p = Data;
140 int st = 1; /* default to "doubt' */
141 DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
142
143 /* HTML try */
144 for (i = 0; i < Size && dIsspace(p[i]); ++i);
145 if ((Size - i >= 5 && !dStrnAsciiCasecmp(p+i, "<html", 5)) ||
146 (Size - i >= 5 && !dStrnAsciiCasecmp(p+i, "<head", 5)) ||
147 (Size - i >= 6 && !dStrnAsciiCasecmp(p+i, "<title", 6)) ||
148 (Size - i >= 14 && !dStrnAsciiCasecmp(p+i, "<!doctype html", 14)) ||
149 /* this line is workaround for FTP through the Squid proxy and Doxygen */
150 (Size - i >= 9 && !dStrnAsciiCasecmp(p+i, "<!-- HTML", 9))) {
151
152 Type = DT_TEXT_HTML;
153 st = 0;
154 /* Images */
155 } else if (Size >= 4 && !strncmp(p, "GIF8", 4)) {
156 Type = DT_IMAGE_GIF;
157 st = 0;
158 } else if (Size >= 4 && !strncmp(p, "\x89PNG", 4)) {
159 Type = DT_IMAGE_PNG;
160 st = 0;
161 } else if (Size >= 2 && !strncmp(p, "\xff\xd8", 2)) {
162 /* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
163 * at the character representation should be machine independent. */
164 Type = DT_IMAGE_JPG;
165 st = 0;
166
167 /* Text */
168 } else {
169 /* Heuristic for "text/plain"
170 * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
171 * All in the above set regard [00-31] as control characters.
172 * LATIN1: [7F-9F] unused
173 * CP-1251 {7F,98} unused (two characters).
174 *
175 * We'll use [0-31] as indicators of non-text content.
176 * Better heuristics are welcomed! :-) */
177
178 non_ascci = non_ascci_text = bin_chars = 0;
179 Size = MIN (Size, 256);
180 for (i = 0; i < Size; i++) {
181 int ch = (uchar_t) p[i];
182 if (ch < 32 && !dIsspace(ch))
183 ++bin_chars;
184 if (ch > 126)
185 ++non_ascci;
186 if (ch > 190)
187 ++non_ascci_text;
188 }
189 if (bin_chars == 0 && (non_ascci - non_ascci_text) <= Size/10) {
190 /* Let's say text: if "rare" chars are <= 10% */
191 Type = DT_TEXT_PLAIN;
192 } else if (Size > 0) {
193 /* a special check for UTF-8 */
194 Size = a_Utf8_end_of_char(p, Size - 1) + 1;
195 if (a_Utf8_test(p, Size) > 0)
196 Type = DT_TEXT_PLAIN;
197 }
198 if (Size >= 256)
199 st = 0;
200 }
201
202 *PT = MimeTypes[Type].str;
203 return st;
204}
205
210void a_Misc_parse_content_type(const char *type, char **major, char **minor,
211 char **charset)
212{
213 static const char tspecials_space[] = "()<>@,;:\\\"/[]?= ";
214 const char *str, *s;
215
216 if (major)
217 *major = NULL;
218 if (minor)
219 *minor = NULL;
220 if (charset)
221 *charset = NULL;
222 if (!(str = type))
223 return;
224
225 for (s = str; *s && d_isascii((uchar_t)*s) && !iscntrl((uchar_t)*s) &&
226 !strchr(tspecials_space, *s); s++) ;
227 if (major)
228 *major = dStrndup(str, s - str);
229
230 if (*s == '/') {
231 for (str = ++s; *s && d_isascii((uchar_t)*s) && !iscntrl((uchar_t)*s) &&
232 !strchr(tspecials_space, *s); s++) ;
233 if (minor)
234 *minor = dStrndup(str, s - str);
235 }
236 if (charset && *s &&
237 (dStrnAsciiCasecmp(type, "text/", 5) == 0 ||
238 dStrnAsciiCasecmp(type, "application/xhtml+xml", 21) == 0)) {
239 /* "charset" parameter defined for text media type in RFC 2046,
240 * application/xhtml+xml in RFC 3236.
241 *
242 * Note that RFC 3023 lists some main xml media types and provides
243 * the convention of using the "+xml" minor type suffix for other
244 * xml types, so it would be reasonable to check for that suffix if
245 * we have need to care about various xml types someday.
246 */
247 const char terminators[] = " ;\t";
248 const char key[] = "charset";
249
250 if ((s = dStriAsciiStr(str, key)) &&
251 (s == str || strchr(terminators, s[-1]))) {
252 s += sizeof(key) - 1;
253 for ( ; *s == ' ' || *s == '\t'; ++s);
254 if (*s == '=') {
255 size_t len;
256 for (++s; *s == ' ' || *s == '\t'; ++s);
257 if ((len = strcspn(s, terminators))) {
258 if (*s == '"' && s[len-1] == '"' && len > 1) {
259 /* quoted string */
260 s++;
261 len -= 2;
262 }
263 *charset = dStrndup(s, len);
264 }
265 }
266 }
267 }
268}
269
/**
 * Compare two Content-Type strings.
 *
 * Two NULL/empty strings are equal; one NULL/empty string against a
 * non-empty one is different. Otherwise the strings are equal when major
 * and minor type match (ignoring ASCII case) and the charsets agree: both
 * absent, both equal, or one absent while the other is "UTF-8".
 *
 * @param ct1  First Content-Type string; may be NULL or empty.
 * @param ct2  Second Content-Type string; may be NULL or empty.
 * @return 0 when they are equal, 1 when they differ.
 */
int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
{
   char *major1, *major2, *minor1, *minor2, *charset1, *charset2;
   const int empty1 = (!ct1 || !*ct1);
   const int empty2 = (!ct2 || !*ct2);
   int same_type, same_charset;

   if (empty1 && empty2)
      return 0;
   if (empty1 || empty2)
      return 1;

   a_Misc_parse_content_type(ct1, &major1, &minor1, &charset1);
   a_Misc_parse_content_type(ct2, &major2, &minor2, &charset2);

   same_type = major1 && major2 && !dStrAsciiCasecmp(major1, major2) &&
               minor1 && minor2 && !dStrAsciiCasecmp(minor1, minor2);

   if (!same_type) {
      same_charset = 0;
   } else if (charset1 && charset2) {
      same_charset = !dStrAsciiCasecmp(charset1, charset2);
   } else if (charset1) {
      /* only the first has a charset: it matches if it is UTF-8 */
      same_charset = !dStrAsciiCasecmp(charset1, "UTF-8");
   } else if (charset2) {
      /* only the second has a charset: it matches if it is UTF-8 */
      same_charset = !dStrAsciiCasecmp(charset2, "UTF-8");
   } else {
      same_charset = 1;
   }

   dFree(major1);
   dFree(major2);
   dFree(minor1);
   dFree(minor2);
   dFree(charset1);
   dFree(charset2);

   return (same_type && same_charset) ? 0 : 1;
}
303
321int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
322{
323 int i;
324 int st = -1;
325
326 _MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType);
327
328 if (!EntryType)
329 return 0; /* there's no mismatch without server type */
330
331 for (i = 1; MimeTypes[i].str; ++i)
332 if (dStrnAsciiCasecmp(EntryType, MimeTypes[i].str, MimeTypes[i].len) ==0)
333 break;
334
335 if (!MimeTypes[i].str) {
336 /* type not found, no mismatch */
337 st = 0;
338 } else if (dStrnAsciiCasecmp(EntryType, "image/", 6) == 0 &&
339 !dStrnAsciiCasecmp(DetectedType, MimeTypes[i].str,
340 MimeTypes[i].len)){
341 /* An image, and there's an exact match */
342 st = 0;
343 } else if (dStrnAsciiCasecmp(EntryType, "text/", 5) ||
344 dStrnAsciiCasecmp(DetectedType, "application/", 12)) {
345 /* Not an application sent as text */
346 st = 0;
347 } else if (dStrnAsciiCasecmp(EntryType, "application/xhtml+xml", 21) &&
348 dStrnAsciiCasecmp(DetectedType, "text/html", 9)) {
349 /* XML version of HTML */
350 st = 0;
351 }
352 _MSG("Type check: %s\n", st == 0 ? "MATCH" : "MISMATCH");
353
354 return st;
355}
356
/**
 * Parse a geometry string.
 *
 * The string is expected to hold a width and a height separated by 'x'
 * or 'X', optionally followed by two more signed numbers for the
 * position, e.g. "640x480+10+20".
 *
 * @param str  Geometry string.
 * @param x    Out: horizontal position; written only if both position
 *             numbers are present.
 * @param y    Out: vertical position; written together with x.
 * @param w    Out: width; written only when width and height both parse.
 * @param h    Out: height; written together with w.
 * @return 1 when width and height were parsed, 0 otherwise.
 */
int a_Misc_parse_geometry(char *str, int *x, int *y, int *w, int *h)
{
   char *sep, *rest, *end_first, *end_second;
   int first, second;
   int have_size = 0;

   /* A lowercase separator is preferred; uppercase is the fallback. */
   sep = strchr(str, 'x');
   if (!sep)
      sep = strchr(str, 'X');

   if (sep) {
      first = strtol(str, &end_first, 10);
      ++sep;                          /* height starts after the 'x' */
      second = strtol(sep, &end_second, 10);

      /* Both numbers must have consumed at least one character. */
      if (end_first != str && end_second != sep) {
         *w = first;
         *h = second;
         have_size = 1;

         /* parse x,y now: two numbers right after the height */
         rest = end_second;
         first = strtol(rest, &end_first, 10);
         second = strtol(end_first, &end_second, 10);
         if (end_first != rest && end_second != end_first) {
            *x = first;
            *y = second;
         }
      }
   }
   _MSG("geom: w,h,x,y = (%d,%d,%d,%d)\n", *w, *h, *x, *y);
   return have_size;
}
386
/**
 * Parse dillorc's search_url string ([<label> ]<url>).
 *
 * When the string contains a space, everything before the last space is
 * the label and the rest is the URL. Without a space the whole string is
 * the URL and the label is taken from the text between the second
 * character after the first '/' and the next '/'.
 * The URL is accepted when it has a '/', at least one character after
 * it, and another '/' from the second character after it onwards.
 *
 * The label is kept in a static buffer and cut to 31 characters, so it
 * is overwritten by the next call. *label is always set to that buffer;
 * *urlstr is only written on success and points into source.
 *
 * @param source  The search_url string.
 * @param label   Out: label (static storage).
 * @param urlstr  Out: start of the URL inside source.
 * @return 0 on success, -1 on error.
 */
int a_Misc_parse_search_url(char *source, char **label, char **urlstr)
{
   static char buf[32];
   char *last_space, *slash, *next_slash = NULL;
   int label_len;
   int ret = -1;

   last_space = strrchr(source, ' ');
   if (last_space) {
      /* label and url pair: copy the label, continue with the url */
      label_len = MIN(last_space - source, 31);
      strncpy(buf, source, label_len);
      buf[label_len] = 0;
      source = last_space + 1;
   }

   slash = strchr(source, '/');
   if (slash && slash[1])
      next_slash = strchr(slash + 2, '/');

   if (next_slash) {
      if (!last_space) {
         /* url only, make a custom label */
         label_len = MIN(next_slash - slash - 2, 31);
         strncpy(buf, slash + 2, label_len);
         buf[label_len] = 0;
      }
      *urlstr = source;
      ret = 0;
   }

   *label = buf;
   if (ret == -1)
      MSG("Invalid search_url: \"%s\"\n", source);
   return ret;
}
420
/**
 * Encodes string using base64 encoding.
 *
 * Every group of three input bytes becomes four output characters; a
 * final group of one or two bytes is padded with '=' to four characters.
 *
 * The input is read through an unsigned char pointer. With plain char,
 * a byte >= 0x80 is negative where char is signed, and shifting it
 * yields a negative index into the alphabet table.
 *
 * @param in  NUL-terminated string to encode; may be NULL.
 * @return NULL when in is NULL, otherwise a NUL-terminated string
 *         allocated with dMalloc().
 */
char *a_Misc_encode_base64(const char *in)
{
   static const char *const base64_hex = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                         "abcdefghijklmnopqrstuvwxyz"
                                         "0123456789+/";
   const unsigned char *src = (const unsigned char *) in;
   char *out = NULL;
   size_t len, i = 0;

   if (in == NULL) return NULL;
   len = strlen(in);

   /* 4 output chars per (started) group of 3 input bytes, plus the NUL */
   out = (char *)dMalloc((len + 2) / 3 * 4 + 1);

   /* full groups of three bytes */
   for (; len >= 3; len -= 3, src += 3) {
      out[i++] = base64_hex[src[0] >> 2];
      out[i++] = base64_hex[((src[0] << 4) & 0x30) | (src[1] >> 4)];
      out[i++] = base64_hex[((src[1] << 2) & 0x3c) | (src[2] >> 6)];
      out[i++] = base64_hex[src[2] & 0x3f];
   }

   /* one or two trailing bytes: encode what is there, pad with '=' */
   if (len > 0) {
      unsigned int fragment = (src[0] << 4) & 0x30;

      out[i++] = base64_hex[src[0] >> 2];
      if (len > 1)
         fragment |= src[1] >> 4;
      out[i++] = base64_hex[fragment];
      out[i++] = (len < 2) ? '=' : base64_hex[(src[1] << 2) & 0x3c];
      out[i++] = '=';
   }
   out[i] = '\0';
   return out;
}
458
464Dstr *a_Misc_file2dstr(const char *filename)
465{
466 FILE *F_in;
467 int n;
468 char buf[4096];
469 Dstr *dstr = NULL;
470
471 if ((F_in = fopen(filename, "r"))) {
472 dstr = dStr_sized_new(4096);
473 while ((n = fread (buf, 1, 4096, F_in)) > 0) {
474 dStr_append_l(dstr, buf, n);
475 }
476 fclose(F_in);
477 }
478 return dstr;
479}
#define _MSG(...)
Definition bookmarks.c:45
#define MSG(...)
Definition bookmarks.c:46
unsigned char uchar_t
Definition d_size.h:17
unsigned int uint_t
Definition d_size.h:20
void dFree(void *mem)
Definition dlib.c:68
int dStrAsciiCasecmp(const char *s1, const char *s2)
Definition dlib.c:203
Dstr * dStr_sized_new(int sz)
Create a new string with a given size.
Definition dlib.c:254
int dStrnAsciiCasecmp(const char *s1, const char *s2, size_t n)
Definition dlib.c:215
void * dMalloc(size_t size)
Definition dlib.c:45
void dStr_free(Dstr *ds, int all)
Free a dillo string.
Definition dlib.c:337
char * dStriAsciiStr(const char *haystack, const char *needle)
Case insensitive strstr.
Definition dlib.c:184
void dStr_append_l(Dstr *ds, const char *s, int l)
Append a C string to a Dstr (providing length).
Definition dlib.c:308
void dStr_append_c(Dstr *ds, int c)
Append one character.
Definition dlib.c:349
char * dStrndup(const char *s, size_t sz)
Definition dlib.c:88
#define MIN(a, b)
Definition dlib.h:30
#define dIsspace(c)
Definition dlib.h:33
#define FALSE
Definition dlib.h:19
int a_Misc_parse_search_url(char *source, char **label, char **urlstr)
Parse dillorc's search_url string ([<label> ]<url>) Return value: -1 on error, 0 on success (and labe...
Definition misc.c:391
int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
Check the server-supplied 'Content-Type' against our detected type.
Definition misc.c:321
int a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
Takes a string and converts any tabs to spaces.
Definition misc.c:54
Dstr * a_Misc_file2dstr(const char *filename)
Load a local file into a dStr.
Definition misc.c:464
int a_Misc_parse_geometry(char *str, int *x, int *y, int *w, int *h)
Parse a geometry string.
Definition misc.c:360
char * a_Misc_encode_base64(const char *in)
Encodes string using base64 encoding.
Definition misc.c:425
DetectedContentType
Definition misc.c:116
@ DT_TEXT_PLAIN
Definition misc.c:120
@ DT_OCTET_STREAM
Definition misc.c:117
@ DT_IMAGE_GIF
Definition misc.c:121
@ DT_IMAGE_PNG
Definition misc.c:122
@ DT_TEXT_HTML
Definition misc.c:119
@ DT_PLACEHOLDER
Definition misc.c:118
@ DT_IMAGE_JPG
Definition misc.c:123
static const ContentType_t MimeTypes[]
Definition misc.c:105
#define TAB_SIZE
Definition misc.c:49
char * a_Misc_escape_chars(const char *str, const char *esc_set)
Escape characters as %XX sequences.
Definition misc.c:26
int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
Compare two Content-Type strings.
Definition misc.c:274
int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
Detects 'Content-Type' from a data stream sample.
Definition misc.c:136
void a_Misc_parse_content_type(const char *type, char **major, char **minor, char **charset)
Parse Content-Type string, e.g., "text/html; charset=utf-8".
Definition misc.c:210
#define d_isascii(c)
Definition misc.h:11
Definition dlib.h:102
Dstr_char_t * str
Definition dlib.h:105
uint_t a_Utf8_decode(const char *str, const char *end, int *len)
Decode a single UTF-8-encoded character starting at p.
Definition utf8.cc:46
int a_Utf8_test(const char *src, unsigned int srclen)
Examine first srclen bytes of src.
Definition utf8.cc:64
uint_t a_Utf8_end_of_char(const char *str, uint_t i)
Return index of the last byte of the UTF-8-encoded character that str + i points to or into.
Definition utf8.cc:23
bool_t a_Utf8_combining_char(int unicode)
Definition utf8.cc:96