dscrape

DuckDuckGo CLI search/scraper
git clone git://git.codemadness.org/dscrape
Log | Files | Refs | README

main.c (7177B)


      1 #include <sys/types.h>
      2 
      3 #include <ctype.h>
      4 #include <err.h>
      5 #include <locale.h>
      6 #include <stdio.h>
      7 #include <stdlib.h>
      8 #include <string.h>
      9 #include <unistd.h>
     10 #include <wchar.h>
     11 
     12 #include "xml.h"
     13 
/* Parser instance; its callbacks are set up in main() and it is handed to
 * xml_parse(). */
static XMLParser x;

/* The search result currently being collected. It is printed and zeroed
 * again in xmltagend() when a "div" end tag is seen inside a result. */
static struct result {
	char title[1024];       /* text appended by xmldata() while istitle is set */
	char url[1024];         /* last "href" value copied by xmlattr() while isurl is set */
	char urldecoded[1024];  /* decodeparam() output for a url starting with "uddg=" */
	char description[4096]; /* text appended by xmldata() while isdescription is set */
} result;

/* Parse state flags: set in xmlattr() from the "class" attribute of the
 * current tag and cleared in xmltagend(). */
static int istitle, isdescription, isurl, isresult;
     24 
     25 void
     26 sanitize(char *s, size_t len)
     27 {
     28 	size_t i;
     29 
     30 	/* trim trailing whitespace */
     31 	for (i = strlen(s); i > 0; i--) {
     32 		if (!isspace((unsigned char)s[i - 1]))
     33 			break;
     34 	}
     35 	s[i] = '\0';
     36 
     37 	/* trim leading whitespace */
     38 	for (i = 0; *s; i++) {
     39 		if (!isspace((unsigned char)s[i]))
     40 			break;
     41 	}
     42 	memmove(s, s + i, len - i + 1);
     43 
     44 	for (i = 0; s[i]; i++) {
     45 		if (iscntrl((unsigned char)s[i]))
     46 			s[i] = ' ';
     47 	}
     48 }
     49 
     50 /* format `len' columns of characters. If string is shorter pad the rest
     51  * with characters `pad`. */
     52 int
     53 utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad)
     54 {
     55 	wchar_t wc;
     56 	size_t col = 0, i, slen, siz = 0;
     57 	int rl, w;
     58 
     59 	if (!len)
     60 		return -1;
     61 
     62 	slen = strlen(s);
     63 	for (i = 0; i < slen; i += rl) {
     64 		if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
     65 			break;
     66 		if ((w = wcwidth(wc)) == -1)
     67 			continue;
     68 		if (col + w > len || (col + w == len && s[i + rl])) {
     69 			if (siz + 4 >= bufsiz)
     70 				return -1;
     71 			memcpy(&buf[siz], "\xe2\x80\xa6", 3);
     72 			siz += 3;
     73 			if (col + w == len && w > 1)
     74 				buf[siz++] = pad;
     75 			buf[siz] = '\0';
     76 			return 0;
     77 		}
     78 		if (siz + rl + 1 >= bufsiz)
     79 			return -1;
     80 		memcpy(&buf[siz], &s[i], rl);
     81 		col += w;
     82 		siz += rl;
     83 		buf[siz] = '\0';
     84 	}
     85 
     86 	len -= col;
     87 	if (siz + len + 1 >= bufsiz)
     88 		return -1;
     89 	memset(&buf[siz], pad, len);
     90 	siz += len;
     91 	buf[siz] = '\0';
     92 
     93 	return 0;
     94 }
     95 
     96 /* Escape characters in links in geomyidae .gph format */
     97 void
     98 gphlink(FILE *fp, const char *s, size_t len)
     99 {
    100 	size_t i;
    101 
    102 	for (i = 0; *s && i < len; s++, i++) {
    103 		switch (*s) {
    104 		case '\r': /* ignore CR */
    105 		case '\n': /* ignore LF */
    106 			break;
    107 		case '\t':
    108 			fputs("        ", fp);
    109 			break;
    110 		case '|': /* escape separators */
    111 			fputs("\\|", fp);
    112 			break;
    113 		default:
    114 			fputc(*s, fp);
    115 			break;
    116 		}
    117 	}
    118 }
    119 
    120 /* print `len' columns of characters. If string is shorter pad the rest with
    121  * characters `pad`. */
    122 void
    123 printutf8pad(const char *s, size_t len, int pad)
    124 {
    125 	wchar_t wc;
    126 	size_t col = 0, i, slen;
    127 	int rl, w;
    128 
    129 	if (!len)
    130 		return;
    131 
    132 	slen = strlen(s);
    133 	for (i = 0; i < slen; i += rl) {
    134 		if ((rl = mbtowc(&wc, s + i, slen - i < 4 ? slen - i : 4)) <= 0)
    135 			break;
    136 		if ((w = wcwidth(wc)) == -1)
    137 			continue;
    138 		if (col + w > len || (col + w == len && s[i + rl])) {
    139 			fputs("\xe2\x80\xa6", stdout);
    140 			col++;
    141 			break;
    142 		}
    143 		fwrite(&s[i], 1, rl, stdout);
    144 		col += w;
    145 	}
    146 	for (; col < len; ++col)
    147 		putchar(pad);
    148 }
    149 
    150 int
    151 hexdigit(int c)
    152 {
    153 	if (c >= '0' && c <= '9')
    154 		return c - '0';
    155 	else if (c >= 'A' && c <= 'F')
    156 		return c - 'A' + 10;
    157 	else if (c >= 'a' && c <= 'f')
    158 		return c - 'a' + 10;
    159 
    160 	return 0;
    161 }
    162 
    163 /* decode until NUL separator or end of "key". */
    164 int
    165 decodeparam(char *buf, size_t bufsiz, const char *s)
    166 {
    167 	size_t i;
    168 
    169 	if (!bufsiz)
    170 		return -1;
    171 
    172 	for (i = 0; *s && *s != '&'; s++) {
    173 		if (i + 3 >= bufsiz)
    174 			return -1;
    175 		switch (*s) {
    176 		case '%':
    177 			if (!isxdigit(*(s+1)) || !isxdigit(*(s+2)))
    178 				return -1;
    179 			buf[i++] = hexdigit(*(s+1)) * 16 + hexdigit(*(s+2));
    180 			s += 2;
    181 			break;
    182 		case '+':
    183 			buf[i++] = ' ';
    184 			break;
    185 		default:
    186 			buf[i++] = *s;
    187 			break;
    188 		}
    189 	}
    190 	buf[i] = '\0';
    191 
    192 	return i;
    193 }
    194 
    195 void
    196 xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
    197         const char *v, size_t vl)
    198 {
    199 	if (!strcmp(t, "div") && !strcmp(a, "class") && strstr(v, "results_links"))
    200 		isresult = 1;
    201 
    202 	if (!isresult)
    203 		return;
    204 
    205 	if (!strcmp(t, "h2") && !strcmp(a, "class") && strstr(v, "result__title"))
    206 		istitle = 1;
    207 	if (!strcmp(t, "a") && !strcmp(a, "class") && strstr(v, "result__snippet"))
    208 		isdescription = 1;
    209 	if (!strcmp(t, "a") && !strcmp(a, "class") && strstr(v, "result__url"))
    210 		isurl = 1;
    211 	if (isurl && !strcmp(t, "a") && !strcmp(a, "href")) {
    212 		strlcpy(result.url, v, sizeof(result.url));
    213 	}
    214 }
    215 
    216 void
    217 xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
    218               const char *v, size_t vl)
    219 {
    220 	char buf[16];
    221 	ssize_t len;
    222 
    223 	if (!isresult || !istitle || !isdescription || !isurl)
    224 		return;
    225 
    226 	if ((len = xml_entitytostr(v, buf, sizeof(buf))) > 0)
    227 		xmlattr(x, t, tl, a, al, buf, (size_t)len);
    228 	else
    229 		xmlattr(x, t, tl, a, al, v, vl);
    230 }
    231 
    232 void
    233 xmldata(XMLParser *x, const char *d, size_t dl)
    234 {
    235 	if (istitle)
    236 		strlcat(result.title, d, sizeof(result.title));
    237 	if (isdescription)
    238 		strlcat(result.description, d, sizeof(result.description));
    239 }
    240 
/* CDATA callback: the content is handled exactly like regular character
 * data, so it is passed straight on to xmldata(). */
void
xmlcdata(XMLParser *x, const char *d, size_t dl)
{
	xmldata(x, d, dl);
}
    246 
    247 void
    248 xmldataentity(XMLParser *x, const char *d, size_t dl)
    249 {
    250 	char buf[16];
    251 	ssize_t len;
    252 
    253 	if (!isresult || !istitle || !isdescription || !isurl)
    254 		return;
    255 
    256 	if ((len = xml_entitytostr(d, buf, sizeof(buf))) > 0)
    257 		xmldata(x, buf, (size_t)len);
    258 	else
    259 		xmldata(x, d, dl);
    260 }
    261 
    262 void
    263 xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
    264 {
    265 	if (isresult) {
    266 		if (isdescription) {
    267 			if (!strcmp(t, "b"))
    268 				strlcat(result.description, "*", sizeof(result.description));
    269 		}
    270 
    271 		if (istitle && !strcmp(t, "h2"))
    272 			istitle = 0;
    273 		if (isdescription && !strcmp(t, "a"))
    274 			isdescription = 0;
    275 		if (isurl && !strcmp(t, "a"))
    276 			isurl = 0;
    277 		if (!strcmp(t, "div")) {
    278 			/* decode url and remove "tracking"/usage part via DDG */
    279 			if (!strncmp(result.url, "uddg=", sizeof("uddg=") - 1)) {
    280 				if (decodeparam(result.urldecoded, sizeof(result.urldecoded),
    281 				    result.url + sizeof("uddg=") - 1) == -1)
    282 					result.urldecoded[0] = '\0';
    283 			}
    284 
    285 			sanitize(result.title, strlen(result.title));
    286 			sanitize(result.urldecoded, strlen(result.urldecoded));
    287 			// XXX: for gopher only.
    288 			sanitize(result.description, strlen(result.description));
    289 
    290 //#define GOPHER
    291 #ifdef GOPHER
    292 			// TODO: encode for gphlink, see stagit-gopher */
    293 			fputs("[h|", stdout);
    294 			fputs(result.title, stdout);
    295 			fputs("|", stdout);
    296 			fputs(result.urldecoded, stdout);
    297 			fputs("|server|port]\n", stdout);
    298 
    299 			fputs("[h|", stdout);
    300 			fputs(result.urldecoded, stdout);
    301 			fputs("|", stdout);
    302 			fputs(result.urldecoded, stdout);
    303 			fputs("|server|port]\n", stdout);
    304 
    305 			fputs("[h|", stdout);
    306 			printutf8pad(result.description, 79, ' ');
    307 			fputs("|", stdout);
    308 			fputs(result.urldecoded, stdout);
    309 			fputs("|server|port]\n", stdout);
    310 
    311 			fputs("\n\n", stdout);
    312 #else
    313 			printutf8pad(result.title, 70, ' ');
    314 			fputs("  ", stdout);
    315 			puts(result.urldecoded);
    316 #endif
    317 
    318 			isresult = istitle = isdescription = isurl = 0;
    319 			memset(&result, 0, sizeof(result));
    320 		}
    321 	}
    322 }
    323 
    324 void
    325 xmltagstart(XMLParser *x, const char *t, size_t tl)
    326 {
    327 	if (isdescription && !strcmp(t, "b"))
    328 		strlcat(result.description, "*", sizeof(result.description));
    329 
    330 }
    331 
    332 int
    333 main(void)
    334 {
    335 	setlocale(LC_CTYPE, "");
    336 
    337 	if (pledge("stdio", NULL) == -1)
    338 		err(1, "pledge");
    339 
    340 	x.xmlattr = xmlattr;
    341 	x.xmlattrentity = xmlattrentity;
    342 	x.xmlcdata = xmlcdata;
    343 	x.xmldata = xmldata;
    344 	x.xmldataentity = xmldataentity;
    345 	x.xmltagend = xmltagend;
    346 	x.xmltagstart = xmltagstart;
    347 
    348 	x.getnext = getchar;
    349 
    350 	xml_parse(&x);
    351 
    352 	return 0;
    353 }