dscrape

DuckDuckGo CLI search/scraper
git clone git://git.codemadness.org/dscrape
Log | Files | Refs | README

xml.c (11059B)


      1 #include <sys/types.h>
      2 
      3 #include <ctype.h>
      4 #include <errno.h>
      5 #include <limits.h>
      6 #include <stdio.h>
      7 #include <stdlib.h>
      8 #include <string.h>
      9 
     10 #include "xml.h"
     11 
/*
 * Parse the attribute list of the tag whose name is stored in x->tag.
 *
 * Characters are read through x->getnext() until a '>' is seen outside an
 * attribute value, or until EOF. For every attribute the following
 * callbacks are invoked when they are set:
 *   x->xmlattrstart   once, before the value
 *   x->xmlattr        with the value; a long value is delivered in several
 *                     calls of at most sizeof(x->data) - 1 bytes each
 *   x->xmlattrentity  with the text of a "&...;" sequence inside the value
 *   x->xmlattrend     once, after the value
 * An attribute without a value is reported with an empty value ("").
 *
 * Attribute names longer than sizeof(x->name) - 1 bytes are truncated.
 * A '/' seen outside a value sets x->isshorttag.
 */
static void
xml_parseattrs(XMLParser *x)
{
	size_t namelen = 0, valuelen;
	/* endname: whitespace or '=' was seen after the current name,
	 * valuestart: '=' was seen, the next non-space character starts the value */
	int c, endsep, endname = 0, valuestart = 0;

	while ((c = x->getnext()) != EOF) {
		if (isspace(c)) {
			/* whitespace after a name terminates the name */
			if (namelen)
				endname = 1;
			continue;
		} else if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
			valuestart = 1;
			endname = 1;
		} else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
			/* attribute without value: a new name starts, or the tag ends */
			x->name[namelen] = '\0';
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
			if (x->xmlattr)
				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
			if (x->xmlattrend)
				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
			endname = 0;
			/* c becomes the first character of the next name; for '>' the
			 * loop is left below and for '/' the name is reset below */
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && valuestart) {
			/* attribute with value */
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);

			valuelen = 0;
			if (c == '\'' || c == '"') {
				/* quoted value: ends at the matching quote character */
				endsep = c;
			} else {
				/* unquoted value: c is already its first character, so
				 * jump into the loop body without reading another one */
				endsep = ' '; /* isspace() */
				goto startvalue;
			}

			/* NOTE: when EOF is hit inside the value this loop ends without
			 * reporting the pending data and without calling xmlattrend */
			while ((c = x->getnext()) != EOF) {
startvalue:
				if (c == '&') { /* entities */
					x->data[valuelen] = '\0';
					/* call data function with data before entity if there is data */
					if (valuelen && x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					/* collect the entity text, including '&', in x->data */
					x->data[0] = c;
					valuelen = 1;
					while ((c = x->getnext()) != EOF) {
						/* value ended before the entity was closed by ';' */
						if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							if (x->xmlattr)
								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity: report it including '&' and ';' */
							x->data[valuelen] = '\0';
							if (x->xmlattrentity)
								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							valuelen = 0;
							break;
						}
					}
				} else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
					/* ordinary value character */
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						/* buffer full: flush it and start a new chunk with c */
						x->data[valuelen] = '\0';
						if (x->xmlattr)
							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
						x->data[0] = c;
						valuelen = 1;
					}
				}
				/* end of value: report the remaining data (possibly empty) */
				if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
					x->data[valuelen] = '\0';
					if (x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					if (x->xmlattrend)
						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
					break;
				}
			}
			/* ready for the next attribute */
			namelen = endname = valuestart = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			/* part of the attribute name; excess characters are dropped */
			x->name[namelen++] = c;
		}
		if (c == '>') {
			/* end of the tag */
			break;
		} else if (c == '/') {
			/* short tag marker: discard it from the name buffer */
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}
    118 
    119 static void
    120 xml_parsecomment(XMLParser *x)
    121 {
    122 	size_t datalen = 0, i = 0;
    123 	int c;
    124 
    125 	if (x->xmlcommentstart)
    126 		x->xmlcommentstart(x);
    127 	while ((c = x->getnext()) != EOF) {
    128 		if (c == '-' || c == '>') {
    129 			if (x->xmlcomment) {
    130 				x->data[datalen] = '\0';
    131 				x->xmlcomment(x, x->data, datalen);
    132 				datalen = 0;
    133 			}
    134 		}
    135 
    136 		if (c == '-') {
    137 			if (++i > 2) {
    138 				if (x->xmlcomment)
    139 					for (; i > 2; i--)
    140 						x->xmlcomment(x, "-", 1);
    141 				i = 2;
    142 			}
    143 			continue;
    144 		} else if (c == '>' && i == 2) {
    145 			if (x->xmlcommentend)
    146 				x->xmlcommentend(x);
    147 			return;
    148 		} else if (i) {
    149 			if (x->xmlcomment) {
    150 				for (; i > 0; i--)
    151 					x->xmlcomment(x, "-", 1);
    152 			}
    153 			i = 0;
    154 		}
    155 
    156 		if (datalen < sizeof(x->data) - 1) {
    157 			x->data[datalen++] = c;
    158 		} else {
    159 			x->data[datalen] = '\0';
    160 			if (x->xmlcomment)
    161 				x->xmlcomment(x, x->data, datalen);
    162 			x->data[0] = c;
    163 			datalen = 1;
    164 		}
    165 	}
    166 }
    167 
    168 static void
    169 xml_parsecdata(XMLParser *x)
    170 {
    171 	size_t datalen = 0, i = 0;
    172 	int c;
    173 
    174 	if (x->xmlcdatastart)
    175 		x->xmlcdatastart(x);
    176 	while ((c = x->getnext()) != EOF) {
    177 		if (c == ']' || c == '>') {
    178 			if (x->xmlcdata) {
    179 				x->data[datalen] = '\0';
    180 				x->xmlcdata(x, x->data, datalen);
    181 				datalen = 0;
    182 			}
    183 		}
    184 
    185 		if (c == ']') {
    186 			if (++i > 2) {
    187 				if (x->xmlcdata)
    188 					for (; i > 2; i--)
    189 						x->xmlcdata(x, "]", 1);
    190 				i = 2;
    191 			}
    192 			continue;
    193 		} else if (c == '>' && i == 2) {
    194 			if (x->xmlcdataend)
    195 				x->xmlcdataend(x);
    196 			return;
    197 		} else if (i) {
    198 			if (x->xmlcdata)
    199 				for (; i > 0; i--)
    200 					x->xmlcdata(x, "]", 1);
    201 			i = 0;
    202 		}
    203 
    204 		if (datalen < sizeof(x->data) - 1) {
    205 			x->data[datalen++] = c;
    206 		} else {
    207 			x->data[datalen] = '\0';
    208 			if (x->xmlcdata)
    209 				x->xmlcdata(x, x->data, datalen);
    210 			x->data[0] = c;
    211 			datalen = 1;
    212 		}
    213 	}
    214 }
    215 
    216 static int
    217 codepointtoutf8(long r, char *s)
    218 {
    219 	if (r == 0) {
    220 		return 0; /* NUL byte */
    221 	} else if (r <= 0x7F) {
    222 		/* 1 byte: 0aaaaaaa */
    223 		s[0] = r;
    224 		return 1;
    225 	} else if (r <= 0x07FF) {
    226 		/* 2 bytes: 00000aaa aabbbbbb */
    227 		s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
    228 		s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
    229 		return 2;
    230 	} else if (r <= 0xFFFF) {
    231 		/* 3 bytes: aaaabbbb bbcccccc */
    232 		s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
    233 		s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
    234 		s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
    235 		return 3;
    236 	} else {
    237 		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
    238 		s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
    239 		s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
    240 		s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
    241 		s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
    242 		return 4;
    243 	}
    244 }
    245 
    246 static int
    247 namedentitytostr(const char *e, char *buf, size_t bufsiz)
    248 {
    249 	static const struct {
    250 		char *entity;
    251 		int c;
    252 	} entities[] = {
    253 		{ "&amp;",  '&'  },
    254 		{ "&lt;",   '<'  },
    255 		{ "&gt;",   '>'  },
    256 		{ "&apos;", '\'' },
    257 		{ "&quot;", '"'  },
    258 		{ "&AMP;",  '&'  },
    259 		{ "&LT;",   '<'  },
    260 		{ "&GT;",   '>'  },
    261 		{ "&APOS;", '\'' },
    262 		{ "&QUOT;", '"'  }
    263 	};
    264 	size_t i;
    265 
    266 	/* buffer is too small */
    267 	if (bufsiz < 2)
    268 		return -1;
    269 
    270 	/* doesn't start with &: can't match */
    271 	if (*e != '&')
    272 		return 0;
    273 
    274 	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
    275 		if (!strcmp(e, entities[i].entity)) {
    276 			buf[0] = entities[i].c;
    277 			buf[1] = '\0';
    278 			return 1;
    279 		}
    280 	}
    281 	return 0;
    282 }
    283 
    284 static int
    285 numericentitytostr(const char *e, char *buf, size_t bufsiz)
    286 {
    287 	long l;
    288 	int len;
    289 	char *end;
    290 
    291 	/* buffer is too small */
    292 	if (bufsiz < 5)
    293 		return -1;
    294 
    295 	/* not a numeric entity */
    296 	if (e[0] != '&' || e[1] != '#')
    297 		return 0;
    298 
    299 	/* e[1] == '#', numeric / hexadecimal entity */
    300 	e += 2; /* skip "&#" */
    301 	errno = 0;
    302 	/* hex (16) or decimal (10) */
    303 	if (*e == 'x')
    304 		l = strtoul(e + 1, &end, 16);
    305 	else
    306 		l = strtoul(e, &end, 10);
    307 	/* invalid value or not a well-formed entity or too high codepoint */
    308 	if (errno || *end != ';' || l > 0x10FFFF)
    309 		return 0;
    310 	len = codepointtoutf8(l, buf);
    311 	buf[len] = '\0';
    312 
    313 	return len;
    314 }
    315 
    316 /* convert named- or numeric entity string to buffer string
    317  * returns byte-length of string. */
    318 int
    319 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
    320 {
    321 	/* buffer is too small */
    322 	if (bufsiz < 5)
    323 		return -1;
    324 	/* doesn't start with & */
    325 	if (e[0] != '&')
    326 		return 0;
    327 	/* named entity */
    328 	if (e[1] != '#')
    329 		return namedentitytostr(e, buf, bufsiz);
    330 	else /* numeric entity */
    331 		return numericentitytostr(e, buf, bufsiz);
    332 }
    333 
    334 void
    335 xml_parse(XMLParser *x)
    336 {
    337 	int c, ispi;
    338 	size_t datalen, tagdatalen, taglen;
    339 
    340 	if (!x->getnext)
    341 		return;
    342 	while ((c = x->getnext()) != EOF && c != '<')
    343 		; /* skip until < */
    344 
    345 	while (c != EOF) {
    346 		if (c == '<') { /* parse tag */
    347 			if ((c = x->getnext()) == EOF)
    348 				return;
    349 
    350 			if (c == '!') { /* cdata and comments */
    351 				for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
    352 					/* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
    353 					if (tagdatalen <= sizeof("[CDATA[") - 1)
    354 						x->data[tagdatalen++] = c;
    355 					if (c == '>')
    356 						break;
    357 					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
    358 							(x->data[0] == '-')) {
    359 						xml_parsecomment(x);
    360 						break;
    361 					} else if (c == '[') {
    362 						if (tagdatalen == sizeof("[CDATA[") - 1 &&
    363 						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
    364 							xml_parsecdata(x);
    365 							break;
    366 						}
    367 					}
    368 				}
    369 			} else {
    370 				x->tag[0] = '\0';
    371 				x->taglen = 0;
    372 
    373 				/* normal tag (open, short open, close), processing instruction. */
    374 				if (isspace(c))
    375 					while ((c = x->getnext()) != EOF && isspace(c))
    376 						;
    377 				if (c == EOF)
    378 					return;
    379 				x->tag[0] = c;
    380 				ispi = (c == '?') ? 1 : 0;
    381 				x->isshorttag = ispi;
    382 				taglen = 1;
    383 				while ((c = x->getnext()) != EOF) {
    384 					if (c == '/')
    385 						x->isshorttag = 1; /* short tag */
    386 					else if (c == '>' || isspace(c)) {
    387 						x->tag[taglen] = '\0';
    388 						if (x->tag[0] == '/') { /* end tag, starts with </ */
    389 							x->taglen = --taglen; /* len -1 because of / */
    390 							if (taglen && x->xmltagend)
    391 								x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
    392 						} else {
    393 							x->taglen = taglen;
    394 							/* start tag */
    395 							if (x->xmltagstart)
    396 								x->xmltagstart(x, x->tag, x->taglen);
    397 							if (isspace(c))
    398 								xml_parseattrs(x);
    399 							if (x->xmltagstartparsed)
    400 								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
    401 						}
    402 						/* call tagend for shortform or processing instruction */
    403 						if ((x->isshorttag || ispi) && x->xmltagend)
    404 							x->xmltagend(x, x->tag, x->taglen, 1);
    405 						break;
    406 					} else if (taglen < sizeof(x->tag) - 1)
    407 						x->tag[taglen++] = c; /* NOTE: tag name truncation */
    408 				}
    409 			}
    410 		} else {
    411 			/* parse tag data */
    412 			datalen = 0;
    413 			if (x->xmldatastart)
    414 				x->xmldatastart(x);
    415 			while ((c = x->getnext()) != EOF) {
    416 				if (c == '&') {
    417 					if (datalen) {
    418 						x->data[datalen] = '\0';
    419 						if (x->xmldata)
    420 							x->xmldata(x, x->data, datalen);
    421 					}
    422 					x->data[0] = c;
    423 					datalen = 1;
    424 					while ((c = x->getnext()) != EOF) {
    425 						if (c == '<')
    426 							break;
    427 						if (datalen < sizeof(x->data) - 1)
    428 							x->data[datalen++] = c;
    429 						else {
    430 							/* entity too long for buffer, handle as normal data */
    431 							x->data[datalen] = '\0';
    432 							if (x->xmldata)
    433 								x->xmldata(x, x->data, datalen);
    434 							x->data[0] = c;
    435 							datalen = 1;
    436 							break;
    437 						}
    438 						if (c == ';') {
    439 							x->data[datalen] = '\0';
    440 							if (x->xmldataentity)
    441 								x->xmldataentity(x, x->data, datalen);
    442 							datalen = 0;
    443 							break;
    444 						}
    445 					}
    446 				} else if (c != '<') {
    447 					if (datalen < sizeof(x->data) - 1) {
    448 						x->data[datalen++] = c;
    449 					} else {
    450 						x->data[datalen] = '\0';
    451 						if (x->xmldata)
    452 							x->xmldata(x, x->data, datalen);
    453 						x->data[0] = c;
    454 						datalen = 1;
    455 					}
    456 				}
    457 				if (c == '<') {
    458 					x->data[datalen] = '\0';
    459 					if (x->xmldata && datalen)
    460 						x->xmldata(x, x->data, datalen);
    461 					if (x->xmldataend)
    462 						x->xmldataend(x);
    463 					break;
    464 				}
    465 			}
    466 		}
    467 	}
    468 }