Dillo v3.2.0-151-g90488cbf
Loading...
Searching...
No Matches
misc.c
Go to the documentation of this file.
1/*
2 * File: misc.c
3 *
4 * Copyright (C) 2000-2007 Jorge Arellano Cid <jcid@dillo.org>,
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
10 */
11
12#include "misc.h"
13
14#include <stdio.h>
15#include <stdlib.h>
16#include <string.h>
17#include <assert.h>
18
19#include "utf8.hh"
20#include "msg.h"
21#include "dlib/dlib.h" /* dIsspace */
22
27char *a_Misc_escape_chars(const char *str, const char *esc_set)
28{
29 static const char *const hex = "0123456789ABCDEF";
30 char *p = NULL;
31 Dstr *dstr;
32 int i;
33
34 dstr = dStr_sized_new(64);
35 for (i = 0; str[i]; ++i) {
36 if (str[i] <= 0x1F || str[i] == 0x7F || strchr(esc_set, str[i])) {
37 dStr_append_c(dstr, '%');
38 dStr_append_c(dstr, hex[(str[i] >> 4) & 15]);
39 dStr_append_c(dstr, hex[str[i] & 15]);
40 } else {
41 dStr_append_c(dstr, str[i]);
42 }
43 }
44 p = dstr->str;
45 dStr_free(dstr, FALSE);
46
47 return p;
48}
49
50#define TAB_SIZE 8
54int
55a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
56{
57 int j, pos = 0, written = 0, old_pos, char_len;
58 uint_t code;
59 static const int combining_char_space = 32;
60
61 while (*start < end && written < buflen - TAB_SIZE - combining_char_space) {
62 code = a_Utf8_decode(*start, end, &char_len);
63
64 if (code == '\t') {
65 /* Fill with whitespaces until the next tab. */
66 old_pos = pos;
67 pos += TAB_SIZE - (pos % TAB_SIZE);
68 for (j = old_pos; j < pos; j++)
69 buf[written++] = ' ';
70 } else {
71 assert(char_len <= 4);
72 for (j = 0; j < char_len; j++)
73 buf[written++] = (*start)[j];
74 pos++;
75 }
76
77 *start += char_len;
78 }
79
80 /* If following chars are combining chars (e.g. accents) add them to the
81 * buffer. We have reserved combining_char_space bytes for this.
82 * If there should be more combining chars, we split nevertheless.
83 */
84 while (*start < end && written < buflen - 4) {
85 code = a_Utf8_decode(*start, end, &char_len);
86
87 if (! a_Utf8_combining_char(code))
88 break;
89
90 assert(char_len <= 4);
91 for (j = 0; j < char_len; j++)
92 buf[written++] = (*start)[j];
93
94 *start += char_len;
95 }
96
97 return written;
98}
99
/*
 * A MIME type string paired with its length in bytes (terminator excluded).
 * TODO: could use dStr ADT!
 */
typedef struct {
   const char *str;
   int len;
} ContentType_t;

/* Build a table entry, deriving the length from the string literal. */
#define MIME_ENTRY(lit) { lit, (int) sizeof(lit) - 1 }

/*
 * MIME types known to this file, terminated by a { NULL, 0 } sentinel.
 * a_Misc_get_content_type_from_data() indexes this table with a
 * DetectedContentType value, so the order of the entries matters.
 */
static const ContentType_t MimeTypes[] = {
   MIME_ENTRY("application/octet-stream"),
   MIME_ENTRY("application/xhtml+xml"),
   MIME_ENTRY("text/html"),
   MIME_ENTRY("text/plain"),
   MIME_ENTRY("image/gif"),
   MIME_ENTRY("image/png"),
   MIME_ENTRY("image/jpeg"),
   { NULL, 0 }
};

#undef MIME_ENTRY
116
126
137int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
138{
139 size_t i, non_ascci, non_ascci_text, bin_chars;
140 char *p = Data;
141 int st = 1; /* default to "doubt' */
142 DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
143
144 /* HTML try */
145 for (i = 0; i < Size && dIsspace(p[i]); ++i);
146 if ((Size - i >= 5 && !dStrnAsciiCasecmp(p+i, "<html", 5)) ||
147 (Size - i >= 5 && !dStrnAsciiCasecmp(p+i, "<head", 5)) ||
148 (Size - i >= 6 && !dStrnAsciiCasecmp(p+i, "<title", 6)) ||
149 (Size - i >= 14 && !dStrnAsciiCasecmp(p+i, "<!doctype html", 14)) ||
150 /* this line is workaround for FTP through the Squid proxy and Doxygen */
151 (Size - i >= 9 && !dStrnAsciiCasecmp(p+i, "<!-- HTML", 9))) {
152
153 Type = DT_TEXT_HTML;
154 st = 0;
155 /* Images */
156 } else if (Size >= 4 && !strncmp(p, "GIF8", 4)) {
157 Type = DT_IMAGE_GIF;
158 st = 0;
159 } else if (Size >= 4 && !strncmp(p, "\x89PNG", 4)) {
160 Type = DT_IMAGE_PNG;
161 st = 0;
162 } else if (Size >= 2 && !strncmp(p, "\xff\xd8", 2)) {
163 /* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
164 * at the character representation should be machine independent. */
165 Type = DT_IMAGE_JPG;
166 st = 0;
167
168 /* Text */
169 } else {
170 /* Heuristic for "text/plain"
171 * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
172 * All in the above set regard [00-31] as control characters.
173 * LATIN1: [7F-9F] unused
174 * CP-1251 {7F,98} unused (two characters).
175 *
176 * We'll use [0-31] as indicators of non-text content.
177 * Better heuristics are welcomed! :-) */
178
179 non_ascci = non_ascci_text = bin_chars = 0;
180 Size = MIN (Size, 256);
181 for (i = 0; i < Size; i++) {
182 int ch = (uchar_t) p[i];
183 if (ch < 32 && !dIsspace(ch))
184 ++bin_chars;
185 if (ch > 126)
186 ++non_ascci;
187 if (ch > 190)
188 ++non_ascci_text;
189 }
190 if (bin_chars == 0 && (non_ascci - non_ascci_text) <= Size/10) {
191 /* Let's say text: if "rare" chars are <= 10% */
192 Type = DT_TEXT_PLAIN;
193 } else if (Size > 0) {
194 /* a special check for UTF-8 */
195 Size = a_Utf8_end_of_char(p, Size - 1) + 1;
196 if (a_Utf8_test(p, Size) > 0)
197 Type = DT_TEXT_PLAIN;
198 }
199 if (Size >= 256)
200 st = 0;
201 }
202
203 *PT = MimeTypes[Type].str;
204 return st;
205}
206
211void a_Misc_parse_content_type(const char *type, char **major, char **minor,
212 char **charset)
213{
214 static const char tspecials_space[] = "()<>@,;:\\\"/[]?= ";
215 const char *str, *s;
216
217 if (major)
218 *major = NULL;
219 if (minor)
220 *minor = NULL;
221 if (charset)
222 *charset = NULL;
223 if (!(str = type))
224 return;
225
226 for (s = str; *s && dIsascii((uchar_t)*s) && !dIscntrl(*s) &&
227 !strchr(tspecials_space, *s); s++) ;
228 if (major)
229 *major = dStrndup(str, s - str);
230
231 if (*s == '/') {
232 for (str = ++s; *s && dIsascii((uchar_t)*s) && !dIscntrl(*s) &&
233 !strchr(tspecials_space, *s); s++) ;
234 if (minor)
235 *minor = dStrndup(str, s - str);
236 }
237 if (charset && *s &&
238 (dStrnAsciiCasecmp(type, "text/", 5) == 0 ||
239 dStrnAsciiCasecmp(type, "application/xhtml+xml", 21) == 0)) {
240 /* "charset" parameter defined for text media type in RFC 2046,
241 * application/xhtml+xml in RFC 3236.
242 *
243 * Note that RFC 3023 lists some main xml media types and provides
244 * the convention of using the "+xml" minor type suffix for other
245 * xml types, so it would be reasonable to check for that suffix if
246 * we have need to care about various xml types someday.
247 */
248 const char terminators[] = " ;\t";
249 const char key[] = "charset";
250
251 if ((s = dStriAsciiStr(str, key)) &&
252 (s == str || strchr(terminators, s[-1]))) {
253 s += sizeof(key) - 1;
254 for ( ; *s == ' ' || *s == '\t'; ++s);
255 if (*s == '=') {
256 size_t len;
257 for (++s; *s == ' ' || *s == '\t'; ++s);
258 if ((len = strcspn(s, terminators))) {
259 if (*s == '"' && s[len-1] == '"' && len > 1) {
260 /* quoted string */
261 s++;
262 len -= 2;
263 }
264 *charset = dStrndup(s, len);
265 }
266 }
267 }
268 }
269}
270
/*
 * Compare two Content-Type strings.
 *
 * Two empty (NULL or "") strings are equal; an empty one never equals a
 * non-empty one. Otherwise both are parsed with
 * a_Misc_parse_content_type() and compared ignoring ASCII case: major and
 * minor types must both be present and equal, and the charsets must be
 * equal too. A missing charset is treated as equal to "UTF-8".
 *
 * Return value: 0 if the types are equal, 1 otherwise.
 */
int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
{
   const int empty1 = (!ct1 || !*ct1);
   const int empty2 = (!ct2 || !*ct2);
   char *maj_a, *min_a, *cs_a;
   char *maj_b, *min_b, *cs_b;
   int same;

   if (empty1 || empty2)
      return (empty1 && empty2) ? 0 : 1;

   a_Misc_parse_content_type(ct1, &maj_a, &min_a, &cs_a);
   a_Misc_parse_content_type(ct2, &maj_b, &min_b, &cs_b);

   same = maj_a && maj_b && !dStrAsciiCasecmp(maj_a, maj_b) &&
          min_a && min_b && !dStrAsciiCasecmp(min_a, min_b);

   if (same) {
      if (cs_a && cs_b)
         same = !dStrAsciiCasecmp(cs_a, cs_b);
      else if (cs_a)
         same = !dStrAsciiCasecmp(cs_a, "UTF-8");
      else if (cs_b)
         same = !dStrAsciiCasecmp(cs_b, "UTF-8");
      /* neither side names a charset: still equal */
   }

   dFree(maj_a);
   dFree(maj_b);
   dFree(min_a);
   dFree(min_b);
   dFree(cs_a);
   dFree(cs_b);

   return same ? 0 : 1;
}
304
322int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
323{
324 int i;
325 int st = -1;
326
327 _MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType);
328
329 if (!EntryType)
330 return 0; /* there's no mismatch without server type */
331
332 for (i = 1; MimeTypes[i].str; ++i)
333 if (dStrnAsciiCasecmp(EntryType, MimeTypes[i].str, MimeTypes[i].len) ==0)
334 break;
335
336 if (!MimeTypes[i].str) {
337 /* type not found, no mismatch */
338 st = 0;
339 } else if (dStrnAsciiCasecmp(EntryType, "image/", 6) == 0 &&
340 !dStrnAsciiCasecmp(DetectedType, MimeTypes[i].str,
341 MimeTypes[i].len)){
342 /* An image, and there's an exact match */
343 st = 0;
344 } else if (dStrnAsciiCasecmp(EntryType, "text/", 5) ||
345 dStrnAsciiCasecmp(DetectedType, "application/", 12)) {
346 /* Not an application sent as text */
347 st = 0;
348 } else if (dStrnAsciiCasecmp(EntryType, "application/xhtml+xml", 21) &&
349 dStrnAsciiCasecmp(DetectedType, "text/html", 9)) {
350 /* XML version of HTML */
351 st = 0;
352 }
353 _MSG("Type check: %s\n", st == 0 ? "MATCH" : "MISMATCH");
354
355 return st;
356}
357
/*
 * Parse a geometry string of the form <width>x<height>[<x><y>], where the
 * separator may be 'x' or 'X' and every number is read with strtol()
 * in base 10 (so the offsets carry their own sign, e.g. "80x24+10-5").
 *
 * *w and *h are written only when both width and height parse; *x and *y
 * are written only when, in addition, both offsets parse.
 *
 * Return value: 1 if width and height were parsed, 0 otherwise.
 */
int a_Misc_parse_geometry(char *str, int *x, int *y, int *w, int *h)
{
   char *sep, *end_a, *end_b;
   long first, second;
   int parsed = 0;

   sep = strchr(str, 'x');
   if (sep == NULL)
      sep = strchr(str, 'X');

   if (sep != NULL) {
      char *height_str = sep + 1;

      first = strtol(str, &end_a, 10);
      second = strtol(height_str, &end_b, 10);
      if (end_a != str && end_b != height_str) {
         char *offsets = end_b;

         *w = (int) first;
         *h = (int) second;
         parsed = 1;

         /* parse x,y now */
         first = strtol(offsets, &end_a, 10);
         second = strtol(end_a, &end_b, 10);
         if (end_a != offsets && end_b != end_a) {
            *x = (int) first;
            *y = (int) second;
         }
      }
   }
   _MSG("geom: w,h,x,y = (%d,%d,%d,%d)\n", *w, *h, *x, *y);
   return parsed;
}
387
/*
 * Locate the part of url between the first '/' plus one more character
 * and the next '/' (for "scheme://host/..." that is the host).
 *
 * Returns a pointer to the start of that part and stores its end (the
 * terminating '/') in *part_end, or returns NULL when url does not have
 * that shape.
 */
static char *Misc_url_first_part(char *url, char **part_end)
{
   char *slash = strchr(url, '/');

   if (slash == NULL || slash[1] == '\0')
      return NULL;
   *part_end = strchr(slash + 2, '/');
   return *part_end ? slash + 2 : NULL;
}

/*
 * Parse dillorc's search_url string ([<label> ]<url>).
 *
 * When source contains a space, the text before the last space is the
 * label and the text after it is the URL. Otherwise the whole string is
 * the URL and the label is taken from the URL itself (the part returned by
 * Misc_url_first_part()).
 *
 * *label always points to a static 32-byte buffer that is overwritten by
 * the next call; labels are truncated to 31 characters. *urlstr is set
 * only on success and points into source.
 *
 * Return value: -1 on error, 0 on success.
 */
int a_Misc_parse_search_url(char *source, char **label, char **urlstr)
{
   static char buf[32];
   char *last_space = strrchr(source, ' ');
   char *part, *part_end;
   size_t n;
   int ret = -1;

   if (last_space != NULL) {
      /* label and url pair */
      n = MIN(last_space - source, 31);
      memcpy(buf, source, n);
      buf[n] = 0;
      source = last_space + 1;
      if (Misc_url_first_part(source, &part_end) != NULL) {
         *urlstr = source;
         ret = 0;
      }
   } else if ((part = Misc_url_first_part(source, &part_end)) != NULL) {
      /* url only, make a custom label */
      n = MIN(part_end - part, 31);
      memcpy(buf, part, n);
      buf[n] = 0;
      *urlstr = source;
      ret = 0;
   }

   *label = buf;
   if (ret == -1)
      MSG("Invalid search_url: \"%s\"\n", source);
   return ret;
}
421
/*
 * Encodes string using base64 encoding.
 *
 * in: NUL-terminated input; NULL is accepted.
 *
 * Return value: NUL-terminated base64 text in a buffer obtained from
 * dMalloc() (padded with '=' to a multiple of four characters), or NULL
 * when in is NULL.
 *
 * The input is read as unsigned bytes. Indexing the alphabet with a plain
 * 'char' would produce negative indices for bytes >= 0x80 on platforms
 * where 'char' is signed.
 */
char *a_Misc_encode_base64(const char *in)
{
   static const char *const base64_hex = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                         "abcdefghijklmnopqrstuvwxyz"
                                         "0123456789+/";
   const unsigned char *src = (const unsigned char *) in;
   char *out = NULL;
   size_t len, i = 0;

   if (in == NULL) return NULL;
   len = strlen(in);

   /* 4 output characters per (possibly partial) group of 3 input bytes,
    * plus the terminator. */
   out = (char *)dMalloc((len + 2) / 3 * 4 + 1);

   /* Full groups: 3 bytes -> 4 characters. */
   for (; len >= 3; len -= 3, src += 3) {
      out[i++] = base64_hex[src[0] >> 2];
      out[i++] = base64_hex[((src[0] << 4) & 0x30) | (src[1] >> 4)];
      out[i++] = base64_hex[((src[1] << 2) & 0x3c) | (src[2] >> 6)];
      out[i++] = base64_hex[src[2] & 0x3f];
   }

   /* Trailing 1 or 2 bytes: encode what is there and pad with '='. */
   if (len > 0) {
      unsigned int fragment = (src[0] << 4) & 0x30;

      out[i++] = base64_hex[src[0] >> 2];
      if (len > 1)
         fragment |= src[1] >> 4;
      out[i++] = base64_hex[fragment];
      out[i++] = (len < 2) ? '=' : base64_hex[(src[1] << 2) & 0x3c];
      out[i++] = '=';
   }
   out[i] = '\0';
   return out;
}
459
465Dstr *a_Misc_file2dstr(const char *filename)
466{
467 FILE *F_in;
468 int n;
469 char buf[4096];
470 Dstr *dstr = NULL;
471
472 if ((F_in = fopen(filename, "r"))) {
473 dstr = dStr_sized_new(4096);
474 while ((n = fread (buf, 1, 4096, F_in)) > 0) {
475 dStr_append_l(dstr, buf, n);
476 }
477 fclose(F_in);
478 }
479 return dstr;
480}
#define _MSG(...)
Definition bookmarks.c:44
#define MSG(...)
Definition bookmarks.c:45
unsigned char uchar_t
Definition d_size.h:17
unsigned int uint_t
Definition d_size.h:20
void dFree(void *mem)
Definition dlib.c:67
int dStrAsciiCasecmp(const char *s1, const char *s2)
Definition dlib.c:202
Dstr * dStr_sized_new(int sz)
Create a new string with a given size.
Definition dlib.c:253
int dStrnAsciiCasecmp(const char *s1, const char *s2, size_t n)
Definition dlib.c:214
void * dMalloc(size_t size)
Definition dlib.c:44
void dStr_free(Dstr *ds, int all)
Free a dillo string.
Definition dlib.c:336
char * dStriAsciiStr(const char *haystack, const char *needle)
Case insensitive strstr.
Definition dlib.c:183
void dStr_append_l(Dstr *ds, const char *s, int l)
Append a C string to a Dstr (providing length).
Definition dlib.c:307
void dStr_append_c(Dstr *ds, int c)
Append one character.
Definition dlib.c:348
char * dStrndup(const char *s, size_t sz)
Definition dlib.c:87
#define MIN(a, b)
Definition dlib.h:43
static int dIsspace(unsigned char c)
Definition dlib.h:53
static int dIsascii(unsigned char c)
Definition dlib.h:57
#define FALSE
Definition dlib.h:32
static int dIscntrl(unsigned char c)
Definition dlib.h:49
int a_Misc_parse_search_url(char *source, char **label, char **urlstr)
Parse dillorc's search_url string ([<label> ]<url>) Return value: -1 on error, 0 on success (and labe...
Definition misc.c:392
int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
Check the server-supplied 'Content-Type' against our detected type.
Definition misc.c:322
int a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
Takes a string and converts any tabs to spaces.
Definition misc.c:55
Dstr * a_Misc_file2dstr(const char *filename)
Load a local file into a dStr.
Definition misc.c:465
int a_Misc_parse_geometry(char *str, int *x, int *y, int *w, int *h)
Parse a geometry string.
Definition misc.c:361
char * a_Misc_encode_base64(const char *in)
Encodes string using base64 encoding.
Definition misc.c:426
DetectedContentType
Definition misc.c:117
@ DT_TEXT_PLAIN
Definition misc.c:121
@ DT_OCTET_STREAM
Definition misc.c:118
@ DT_IMAGE_GIF
Definition misc.c:122
@ DT_IMAGE_PNG
Definition misc.c:123
@ DT_TEXT_HTML
Definition misc.c:120
@ DT_PLACEHOLDER
Definition misc.c:119
@ DT_IMAGE_JPG
Definition misc.c:124
static const ContentType_t MimeTypes[]
Definition misc.c:106
#define TAB_SIZE
Definition misc.c:50
char * a_Misc_escape_chars(const char *str, const char *esc_set)
Escape characters as XX sequences.
Definition misc.c:27
int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
Compare two Content-Type strings.
Definition misc.c:275
int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
Detects 'Content-Type' from a data stream sample.
Definition misc.c:137
void a_Misc_parse_content_type(const char *type, char **major, char **minor, char **charset)
Parse Content-Type string, e.g., "text/html; charset=utf-8".
Definition misc.c:211
Definition dlib.h:131
Dstr_char_t * str
Definition dlib.h:134
uint_t a_Utf8_decode(const char *str, const char *end, int *len)
Decode a single UTF-8-encoded character starting at p.
Definition utf8.cc:46
int a_Utf8_test(const char *src, unsigned int srclen)
Examine first srclen bytes of src.
Definition utf8.cc:64
uint_t a_Utf8_end_of_char(const char *str, uint_t i)
Return index of the last byte of the UTF-8-encoded character that str + i points to or into.
Definition utf8.cc:23
bool_t a_Utf8_combining_char(int unicode)
Definition utf8.cc:96