dscrape

Duckduckgo CLI search/scraper
git clone git://git.codemadness.org/dscrape
Log | Files | Refs | README

commit f63dba8df97ecc58bdffdb4e74b056571fef3513
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Fri,  1 Mar 2019 12:09:57 +0100

initial insertion

Diffstat:
A Makefile | 5 +++++
A README | 3 +++
A duckduckgo | 9 +++++++++
A main.c | 353 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A xml.c | 468 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A xml.h | 40 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 878 insertions(+), 0 deletions(-)

diff --git a/Makefile b/Makefile @@ -0,0 +1,5 @@ +build: clean + cc xml.c main.c -o dscrape ${CFLAGS} ${LDFLAGS} + +clean: + rm -f dscrape *.o diff --git a/README b/README @@ -0,0 +1,3 @@ +duckduckgo CLI search tool + +work-in-progress, do not use. diff --git a/duckduckgo b/duckduckgo @@ -0,0 +1,9 @@ +#!/bin/sh +if test x"$1" = x""; then + echo "usage: $0 <keywords>" >&2 + exit 1 +fi + +hurl -m 1024000 -t 15 "https://duckduckgo.com/html/?q=$1" | ./dscrape + +#curl -H 'User-Agent:' "https://duckduckgo.com/html/?q=$1" diff --git a/main.c b/main.c @@ -0,0 +1,353 @@ +#include <sys/types.h> + +#include <ctype.h> +#include <err.h> +#include <locale.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <wchar.h> + +#include "xml.h" + +static XMLParser x; +/* fields of the search result currently being collected */ +static struct result { + char title[1024]; + char url[1024]; + char urldecoded[1024]; + char description[4096]; +} result; +/* parse state flags, raised in xmlattr() and cleared in xmltagend() */ +static int istitle, isdescription, isurl, isresult; +/* Strip trailing, then leading whitespace from s in place and turn the remaining control characters into spaces. len is used to size the memmove and is expected to equal strlen(s) on entry. NOTE(review): the leading-whitespace loop tests *s instead of s[i]; it still stops because the trailing trim leaves either an empty string or a non-space character. */ +void +sanitize(char *s, size_t len) +{ + size_t i; + + /* trim trailing whitespace */ + for (i = strlen(s); i > 0; i--) { + if (!isspace((unsigned char)s[i - 1])) + break; + } + s[i] = '\0'; + + /* trim leading whitespace */ + for (i = 0; *s; i++) { + if (!isspace((unsigned char)s[i])) + break; + } + memmove(s, s + i, len - i + 1); + + for (i = 0; s[i]; i++) { + if (iscntrl((unsigned char)s[i])) + s[i] = ' '; + } +} + +/* format `len' columns of characters. If string is shorter pad the rest + * with characters `pad`. The result is written NUL-terminated to buf; a string that does not fit in len columns is cut and ends in a UTF-8 ellipsis. Returns 0 on success, -1 when len is 0 or buf (bufsiz bytes) is too small. */ +int +utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad) +{ + wchar_t wc; + size_t col = 0, i, slen, siz = 0; + int rl, w; + + if (!len) + return -1; + + slen = strlen(s); + for (i = 0; i < slen; i += rl) { + if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? 
slen - i : 4)) <= 0) + break; + if ((w = wcwidth(wc)) == -1) + continue; + if (col + w > len || (col + w == len && s[i + rl])) { + if (siz + 4 >= bufsiz) + return -1; + memcpy(&buf[siz], "\xe2\x80\xa6", 3); + siz += 3; + if (col + w == len && w > 1) + buf[siz++] = pad; + buf[siz] = '\0'; + return 0; + } + if (siz + rl + 1 >= bufsiz) + return -1; + memcpy(&buf[siz], &s[i], rl); + col += w; + siz += rl; + buf[siz] = '\0'; + } + + len -= col; + if (siz + len + 1 >= bufsiz) + return -1; + memset(&buf[siz], pad, len); + siz += len; + buf[siz] = '\0'; + + return 0; +} + +/* Escape characters in links in geomyidae .gph format: writes at most len bytes of s to fp, skipping CR and LF, writing blank space for a tab and a backslash before each pipe character. */ +void +gphlink(FILE *fp, const char *s, size_t len) +{ + size_t i; + + for (i = 0; *s && i < len; s++, i++) { + switch (*s) { + case '\r': /* ignore CR */ + case '\n': /* ignore LF */ + break; + case '\t': + fputs(" ", fp); + break; + case '|': /* escape separators */ + fputs("\\|", fp); + break; + default: + fputc(*s, fp); + break; + } + } +} + +/* print `len' columns of characters. If string is shorter pad the rest with + * characters `pad`. Output goes to stdout; a string that does not fit is cut and ends in a UTF-8 ellipsis. Nothing is printed when len is 0. */ +void +printutf8pad(const char *s, size_t len, int pad) +{ + wchar_t wc; + size_t col = 0, i, slen; + int rl, w; + + if (!len) + return; + + slen = strlen(s); + for (i = 0; i < slen; i += rl) { + if ((rl = mbtowc(&wc, s + i, slen - i < 4 ? slen - i : 4)) <= 0) + break; + if ((w = wcwidth(wc)) == -1) + continue; + if (col + w > len || (col + w == len && s[i + rl])) { + fputs("\xe2\x80\xa6", stdout); + col++; + break; + } + fwrite(&s[i], 1, rl, stdout); + col += w; + } + for (; col < len; ++col) + putchar(pad); +} +/* Numeric value (0 to 15) of the hexadecimal digit c; any other character yields 0. */ +int +hexdigit(int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + + return 0; +} + +/* decode until NUL separator or end of "key". 
*/
+/* Decode a URL query value from s into buf, stopping at NUL or '&':
+ * "%XX" becomes the byte XX and '+' becomes a space.
+ * Returns the decoded length, or -1 when bufsiz is 0, when buf has three or
+ * fewer bytes left, or when '%' is not followed by two hex digits. */
+int
+decodeparam(char *buf, size_t bufsiz, const char *s)
+{
+	size_t i;
+
+	if (!bufsiz)
+		return -1;
+
+	for (i = 0; *s && *s != '&'; s++) {
+		if (i + 3 >= bufsiz)
+			return -1;
+		switch (*s) {
+		case '%':
+			/* <ctype.h> functions need an unsigned char value: a plain
+			 * char may be negative for bytes >= 0x80 */
+			if (!isxdigit((unsigned char)*(s+1)) ||
+			    !isxdigit((unsigned char)*(s+2)))
+				return -1;
+			buf[i++] = hexdigit(*(s+1)) * 16 + hexdigit(*(s+2));
+			s += 2;
+			break;
+		case '+':
+			buf[i++] = ' ';
+			break;
+		default:
+			buf[i++] = *s;
+			break;
+		}
+	}
+	buf[i] = '\0';
+
+	return (int)i;
+}
+
+/* Attribute handler. A div whose class contains "results_links" starts a
+ * result. Inside a result the classes result__title (h2), result__snippet (a)
+ * and result__url (a) raise the matching flag, and the href of the url anchor
+ * is copied to result.url.
+ * The copy overwrites result.url on every call, so when the parser delivers a
+ * value in several pieces (it splits at entities) only the last piece is kept;
+ * xmltagend() expects that piece to start with "uddg=". */
+void
+xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+	const char *v, size_t vl)
+{
+	if (!strcmp(t, "div") && !strcmp(a, "class") && strstr(v, "results_links"))
+		isresult = 1;
+
+	if (!isresult)
+		return;
+
+	if (!strcmp(t, "h2") && !strcmp(a, "class") && strstr(v, "result__title"))
+		istitle = 1;
+	if (!strcmp(t, "a") && !strcmp(a, "class") && strstr(v, "result__snippet"))
+		isdescription = 1;
+	if (!strcmp(t, "a") && !strcmp(a, "class") && strstr(v, "result__url"))
+		isurl = 1;
+	if (isurl && !strcmp(t, "a") && !strcmp(a, "href")) {
+		strlcpy(result.url, v, sizeof(result.url));
+	}
+}
+
+/* Entity inside an attribute value: decode it when possible and hand the
+ * result (or the raw entity text) to xmlattr(). */
+void
+xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+	const char *v, size_t vl)
+{
+	char buf[16];
+	ssize_t len;
+
+	/* only the result gate applies here; requiring the title, description
+	 * and url flags all at once could never be satisfied */
+	if (!isresult)
+		return;
+
+	if ((len = xml_entitytostr(v, buf, sizeof(buf))) > 0)
+		xmlattr(x, t, tl, a, al, buf, (size_t)len);
+	else
+		xmlattr(x, t, tl, a, al, v, vl);
+}
+
+/* Text handler: appends d to the title and/or description being collected. */
+void
+xmldata(XMLParser *x, const char *d, size_t dl)
+{
+	if (istitle)
+		strlcat(result.title, d, sizeof(result.title));
+	if (isdescription)
+		strlcat(result.description, d, sizeof(result.description));
+}
+
+/* CDATA is treated like ordinary text. */
+void
+xmlcdata(XMLParser *x, const char *d, size_t dl)
+{
+	xmldata(x, d, dl);
+}
+
+/* Entity inside text: decode it when possible and append the result (or the
+ * raw entity text) through xmldata(). */
+void
+xmldataentity(XMLParser *x, const char *d, size_t dl)
+{
+	char buf[16];
+	ssize_t len;
+
+	/* skip unless inside a result and inside one of the collected fields;
+	 * the title and description flags are never set together, so demanding
+	 * all flags dropped every entity from the output */
+	if (!isresult || !(istitle || isdescription))
+		return;
+
+	if ((len = xml_entitytostr(d, buf, sizeof(buf))) > 0)
+		xmldata(x, buf,
(size_t)len); + else + xmldata(x, d, dl); +} +/* End tag handler, active only inside a result: appends an asterisk to the description when a b element closes, clears the title, description and url flags at the end of h2 and a, and at the end of a div decodes result.url when it starts with uddg=, sanitizes the fields, prints the result and resets all state. */ +void +xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) +{ + if (isresult) { + if (isdescription) { + if (!strcmp(t, "b")) + strlcat(result.description, "*", sizeof(result.description)); + } + + if (istitle && !strcmp(t, "h2")) + istitle = 0; + if (isdescription && !strcmp(t, "a")) + isdescription = 0; + if (isurl && !strcmp(t, "a")) + isurl = 0; + if (!strcmp(t, "div")) { + /* decode url and remove "tracking"/usage part via DDG */ + if (!strncmp(result.url, "uddg=", sizeof("uddg=") - 1)) { + if (decodeparam(result.urldecoded, sizeof(result.urldecoded), + result.url + sizeof("uddg=") - 1) == -1) + result.urldecoded[0] = '\0'; + } + + sanitize(result.title, strlen(result.title)); + sanitize(result.urldecoded, strlen(result.urldecoded)); + // XXX: for gopher only. + sanitize(result.description, strlen(result.description)); + +//#define GOPHER +#ifdef GOPHER + // TODO: encode for gphlink, see stagit-gopher */ + fputs("[h|", stdout); + fputs(result.title, stdout); + fputs("|", stdout); + fputs(result.urldecoded, stdout); + fputs("|server|port]\n", stdout); + + fputs("[h|", stdout); + fputs(result.urldecoded, stdout); + fputs("|", stdout); + fputs(result.urldecoded, stdout); + fputs("|server|port]\n", stdout); + + fputs("[h|", stdout); + printutf8pad(result.description, 79, ' '); + fputs("|", stdout); + fputs(result.urldecoded, stdout); + fputs("|server|port]\n", stdout); + + fputs("\n\n", stdout); +#else + printutf8pad(result.title, 70, ' '); + fputs(" ", stdout); + puts(result.urldecoded); +#endif + + isresult = istitle = isdescription = isurl = 0; + memset(&result, 0, sizeof(result)); + } + } +} +/* Start tag handler: appends an asterisk to the description when a b element opens inside it. */ +void +xmltagstart(XMLParser *x, const char *t, size_t tl) +{ + if (isdescription && !strcmp(t, "b")) + strlcat(result.description, "*", sizeof(result.description)); + +} +/* Sets the locale, restricts the process with pledge(stdio), installs the handlers above and parses stdin through getchar(). */ +int +main(void) +{ + setlocale(LC_CTYPE, ""); + + if (pledge("stdio", NULL) == -1) + err(1, "pledge"); + + x.xmlattr = xmlattr; + 
x.xmlattrentity = xmlattrentity; + x.xmlcdata = xmlcdata; + x.xmldata = xmldata; + x.xmldataentity = xmldataentity; + x.xmltagend = xmltagend; + x.xmltagstart = xmltagstart; + + x.getnext = getchar; + + xml_parse(&x); + + return 0; +} diff --git a/xml.c b/xml.c @@ -0,0 +1,468 @@ +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "xml.h" +/* Reads the attributes of the current tag through x->getnext() until the closing > or EOF and reports them via xmlattrstart, xmlattr, xmlattrentity and xmlattrend. Values longer than x->data arrive in several xmlattr calls; a slash marks the tag as short form. */ +static void +xml_parseattrs(XMLParser *x) +{ + size_t namelen = 0, valuelen; + int c, endsep, endname = 0, valuestart = 0; + + while ((c = x->getnext()) != EOF) { + if (isspace(c)) { + if (namelen) + endname = 1; + continue; + } else if (c == '?') + ; /* ignore */ + else if (c == '=') { + x->name[namelen] = '\0'; + valuestart = 1; + endname = 1; + } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) { + /* attribute without value */ + x->name[namelen] = '\0'; + if (x->xmlattrstart) + x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); + if (x->xmlattrend) + x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); + endname = 0; + x->name[0] = c; + namelen = 1; + } else if (namelen && valuestart) { + /* attribute with value */ + if (x->xmlattrstart) + x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); + + valuelen = 0; + if (c == '\'' || c == '"') { + endsep = c; + } else { + endsep = ' '; /* isspace() */ + goto startvalue; + } + + while ((c = x->getnext()) != EOF) { +startvalue: + if (c == '&') { /* entities */ + x->data[valuelen] = '\0'; + /* call data function with data before entity if there is data */ + if (valuelen && x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + while ((c = x->getnext()) != EOF) { + if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) + break; + if (valuelen < sizeof(x->data) - 1) + 
x->data[valuelen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + break; + } + if (c == ';') { + x->data[valuelen] = '\0'; + if (x->xmlattrentity) + x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + valuelen = 0; + break; + } + } + } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) { + if (valuelen < sizeof(x->data) - 1) { + x->data[valuelen++] = c; + } else { + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + } + } + if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) { + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + if (x->xmlattrend) + x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); + break; + } + } + namelen = endname = valuestart = 0; + } else if (namelen < sizeof(x->name) - 1) { + x->name[namelen++] = c; + } + if (c == '>') { + break; + } else if (c == '/') { + x->isshorttag = 1; + x->name[0] = '\0'; + namelen = 0; + } + } +} +/* Reads a comment up to its terminator (two dashes followed by >) or EOF, passing the text to xmlcomment in pieces; xmlcommentstart is called first and xmlcommentend once the terminator is seen. */ +static void +xml_parsecomment(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + if (x->xmlcommentstart) + x->xmlcommentstart(x); + while ((c = x->getnext()) != EOF) { + if (c == '-' || c == '>') { + if (x->xmlcomment) { + x->data[datalen] = '\0'; + x->xmlcomment(x, x->data, datalen); + datalen = 0; + } + } + + if (c == '-') { + if (++i > 2) { + if (x->xmlcomment) + for (; i > 2; i--) + x->xmlcomment(x, "-", 1); + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + if (x->xmlcommentend) + x->xmlcommentend(x); + return; + } else if (i) { + if (x->xmlcomment) { + for (; i > 0; i--) + x->xmlcomment(x, "-", 1); + } + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { 
+ x->data[datalen] = '\0'; + if (x->xmlcomment) + x->xmlcomment(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } +} +/* Reads a CDATA section up to its terminator (two closing brackets followed by >) or EOF, passing the text to xmlcdata in pieces; xmlcdatastart is called first and xmlcdataend once the terminator is seen. */ +static void +xml_parsecdata(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + if (x->xmlcdatastart) + x->xmlcdatastart(x); + while ((c = x->getnext()) != EOF) { + if (c == ']' || c == '>') { + if (x->xmlcdata) { + x->data[datalen] = '\0'; + x->xmlcdata(x, x->data, datalen); + datalen = 0; + } + } + + if (c == ']') { + if (++i > 2) { + if (x->xmlcdata) + for (; i > 2; i--) + x->xmlcdata(x, "]", 1); + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + if (x->xmlcdataend) + x->xmlcdataend(x); + return; + } else if (i) { + if (x->xmlcdata) + for (; i > 0; i--) + x->xmlcdata(x, "]", 1); + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmlcdata) + x->xmlcdata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } +} +/* Encodes code point r as UTF-8 into s, without a terminating NUL, and returns the byte count (1 to 4), or 0 when r is 0. NOTE(review): a negative r takes the one-byte branch, so callers have to pass r >= 0. */ +static int +codepointtoutf8(long r, char *s) +{ + if (r == 0) { + return 0; /* NUL byte */ + } else if (r <= 0x7F) { + /* 1 byte: 0aaaaaaa */ + s[0] = r; + return 1; + } else if (r <= 0x07FF) { + /* 2 bytes: 00000aaa aabbbbbb */ + s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ + s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ + return 2; + } else if (r <= 0xFFFF) { + /* 3 bytes: aaaabbbb bbcccccc */ + s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ + s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ + s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ + return 3; + } else { + /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ + s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ + s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ + s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ + s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ + return 4; + } +} +/* Looks e up in a fixed table holding amp, lt, gt, apos and quot in lower and upper case. Returns 1 with the character and a NUL stored in buf, 0 when e is not in the table, -1 when bufsiz is below 2. */ +static int +namedentitytostr(const char *e, char *buf, size_t bufsiz) +{ + static const struct { + char *entity; + int c; + } entities[] = { + { "&amp;", '&' }, + { "&lt;", '<' 
}, + { "&gt;", '>' }, + { "&apos;", '\'' }, + { "&quot;", '"' }, + { "&AMP;", '&' }, + { "&LT;", '<' }, + { "&GT;", '>' }, + { "&APOS;", '\'' }, + { "&QUOT;", '"' } + }; + size_t i; + + /* buffer is too small */ + if (bufsiz < 2) + return -1; + + /* doesn't start with &: can't match */ + if (*e != '&') + return 0; + + for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { + if (!strcmp(e, entities[i].entity)) { + buf[0] = entities[i].c; + buf[1] = '\0'; + return 1; + } + } + return 0; +} + +static int +numericentitytostr(const char *e, char *buf, size_t bufsiz) +{ + long l; + int len; + char *end; + + /* buffer is too small */ + if (bufsiz < 5) + return -1; + + /* not a numeric entity */ + if (e[0] != '&' || e[1] != '#') + return 0; + + /* e[1] == '#', numeric / hexadecimal entity */ + e += 2; /* skip "&#" */ + errno = 0; + /* hex (16) or decimal (10) */ + if (*e == 'x') + l = strtoul(e + 1, &end, 16); + else + l = strtoul(e, &end, 10); + /* invalid value or not a well-formed entity or too high codepoint */ + if (errno || *end != ';' || l > 0x10FFFF) + return 0; + len = codepointtoutf8(l, buf); + buf[len] = '\0'; + + return len; +} + +/* convert named- or numeric entity string to buffer string + * returns byte-length of string. 
*/ /* Returns -1 when bufsiz is below 5 and 0 when e is not a recognised entity. */ +int +xml_entitytostr(const char *e, char *buf, size_t bufsiz) +{ + /* buffer is too small */ + if (bufsiz < 5) + return -1; + /* doesn't start with & */ + if (e[0] != '&') + return 0; + /* named entity */ + if (e[1] != '#') + return namedentitytostr(e, buf, bufsiz); + else /* numeric entity */ + return numericentitytostr(e, buf, bufsiz); +} +/* Parser main loop: pulls characters from x->getnext() until EOF and dispatches tags, attributes, comments, CDATA, text and entities to the handlers set in x. Input before the first < is skipped; returns at once when getnext is NULL. */ +void +xml_parse(XMLParser *x) +{ + int c, ispi; + size_t datalen, tagdatalen, taglen; + + if (!x->getnext) + return; + while ((c = x->getnext()) != EOF && c != '<') + ; /* skip until < */ + + while (c != EOF) { + if (c == '<') { /* parse tag */ + if ((c = x->getnext()) == EOF) + return; + + if (c == '!') { /* cdata and comments */ + for (tagdatalen = 0; (c = x->getnext()) != EOF;) { + /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */ + if (tagdatalen <= sizeof("[CDATA[") - 1) + x->data[tagdatalen++] = c; + if (c == '>') + break; + else if (c == '-' && tagdatalen == sizeof("--") - 1 && + (x->data[0] == '-')) { + xml_parsecomment(x); + break; + } else if (c == '[') { + if (tagdatalen == sizeof("[CDATA[") - 1 && + !strncmp(x->data, "[CDATA[", tagdatalen)) { + xml_parsecdata(x); + break; + } + } + } + } else { + x->tag[0] = '\0'; + x->taglen = 0; + + /* normal tag (open, short open, close), processing instruction. */ + if (isspace(c)) + while ((c = x->getnext()) != EOF && isspace(c)) + ; + if (c == EOF) + return; + x->tag[0] = c; + ispi = (c == '?') ? 
1 : 0; + x->isshorttag = ispi; + taglen = 1; + while ((c = x->getnext()) != EOF) { + if (c == '/') + x->isshorttag = 1; /* short tag */ + else if (c == '>' || isspace(c)) { + x->tag[taglen] = '\0'; + if (x->tag[0] == '/') { /* end tag, starts with </ */ + x->taglen = --taglen; /* len -1 because of / */ + if (taglen && x->xmltagend) + x->xmltagend(x, &(x->tag)[1], x->taglen, 0); + } else { + x->taglen = taglen; + /* start tag */ + if (x->xmltagstart) + x->xmltagstart(x, x->tag, x->taglen); + if (isspace(c)) + xml_parseattrs(x); + if (x->xmltagstartparsed) + x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); + } + /* call tagend for shortform or processing instruction */ + if ((x->isshorttag || ispi) && x->xmltagend) + x->xmltagend(x, x->tag, x->taglen, 1); + break; + } else if (taglen < sizeof(x->tag) - 1) + x->tag[taglen++] = c; /* NOTE: tag name truncation */ + } + } + } else { + /* parse tag data: text up to the next <, entities are reported through xmldataentity */ + datalen = 0; + if (x->xmldatastart) + x->xmldatastart(x); + while ((c = x->getnext()) != EOF) { + if (c == '&') { + if (datalen) { + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + } + x->data[0] = c; + datalen = 1; + while ((c = x->getnext()) != EOF) { + if (c == '<') + break; + if (datalen < sizeof(x->data) - 1) + x->data[datalen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + break; + } + if (c == ';') { + x->data[datalen] = '\0'; + if (x->xmldataentity) + x->xmldataentity(x, x->data, datalen); + datalen = 0; + break; + } + } + } else if (c != '<') { + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } + if (c == '<') { + x->data[datalen] = '\0'; + if (x->xmldata && datalen) + x->xmldata(x, x->data, datalen); + if (x->xmldataend) + 
x->xmldataend(x); + break; + } + } + } + } +} diff --git a/xml.h b/xml.h @@ -0,0 +1,40 @@ +typedef struct xmlparser { + /* handlers: each may be left NULL, the parser tests the pointer before calling */ + void (*xmlattr)(struct xmlparser *, const char *, size_t, + const char *, size_t, const char *, size_t); + void (*xmlattrend)(struct xmlparser *, const char *, size_t, + const char *, size_t); + void (*xmlattrstart)(struct xmlparser *, const char *, size_t, + const char *, size_t); + void (*xmlattrentity)(struct xmlparser *, const char *, size_t, + const char *, size_t, const char *, size_t); + void (*xmlcdatastart)(struct xmlparser *); + void (*xmlcdata)(struct xmlparser *, const char *, size_t); + void (*xmlcdataend)(struct xmlparser *); + void (*xmlcommentstart)(struct xmlparser *); + void (*xmlcomment)(struct xmlparser *, const char *, size_t); + void (*xmlcommentend)(struct xmlparser *); + void (*xmldata)(struct xmlparser *, const char *, size_t); + void (*xmldataend)(struct xmlparser *); + void (*xmldataentity)(struct xmlparser *, const char *, size_t); + void (*xmldatastart)(struct xmlparser *); + void (*xmltagend)(struct xmlparser *, const char *, size_t, int); + void (*xmltagstart)(struct xmlparser *, const char *, size_t); + void (*xmltagstartparsed)(struct xmlparser *, const char *, + size_t, int); + /* character source, EOF ends parsing; xml_parse() returns at once when this is NULL */ + int (*getnext)(void); + + /* current tag */ + char tag[1024]; + size_t taglen; + /* current tag is in short form ? <tag /> */ + int isshorttag; + /* current attribute name */ + char name[1024]; + /* data buffer used for tag data, cdata and attribute data */ + char data[BUFSIZ]; +} XMLParser; +/* NOTE(review): this header uses size_t and BUFSIZ but includes nothing itself; both .c files include stdio.h before it */ +int xml_entitytostr(const char *, char *, size_t); +void xml_parse(XMLParser *);