idiotbox

youtube scraping and interfaces: CGI, Gopher, CLI
Log | Files | Refs | README | LICENSE

commit d10243c22f062c08b10bf2e59adefda40e1293fb
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 29 Dec 2018 16:09:33 +0100

initial repo

Diffstat:
A LICENSE | 15 +++++++++++++++
A Makefile | 19 +++++++++++++++++++
A README | 44 ++++++++++++++++++++++++++++++++++++++++++++
A TODO | 16 ++++++++++++++++
A cli.c | 221 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A main.c | 464 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A xml.c | 474 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A xml.h | 40 ++++++++++++++++++++++++++++++++++++++++
A youtube.c | 535 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A youtube.h | 17 +++++++++++++++++
10 files changed, 1845 insertions(+), 0 deletions(-)

diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2018 Hiltjo Posthuma <hiltjo@codemadness.org>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,19 @@
+build: clean
+	cc -c xml.c ${CFLAGS} -Wall
+	cc -c youtube.c ${CFLAGS} -Wall
+	# UIs
+	# HTML
+	cc -c main.c ${CFLAGS} -Wall
+	# CLI
+	cc -c cli.c ${CFLAGS} -Wall
+	# Link HTML CGI (static)
+	cc -o main xml.o youtube.o main.o \
+		${LDFLAGS} \
+		-ltls -lssl -lcrypto -static
+	# Link CLI UI
+	cc -o cli xml.o youtube.o cli.o \
+		${LDFLAGS} \
+		-ltls
+
+clean:
+	rm -f main cli *.o
diff --git a/README b/README
@@ -0,0 +1,44 @@
+Dependencies:
+-------------
+
+- C compiler.
+- LibreSSL + libtls.
+
+
+Compile
+-------
+
+- make
+- doas make install
+
+
+Install HTTP CGI
+----------------
+
+Nginx + slowcgi example:
+
+	location /idiotbox/css/.* {
+		root /home/www/domains/www.codemadness.org/htdocs/idiotbox/css;
+	}
+
+	location ~ ^/idiotbox(/|/\?.*)$ {
+		include /etc/nginx/fastcgi_params;
+		fastcgi_pass unix:/run/slowcgi.sock;
+		fastcgi_param SCRIPT_FILENAME /cgi-bin/idiotbox;
+		fastcgi_param SCRIPT_NAME /cgi-bin/idiotbox;
+		fastcgi_param REQUEST_URI /cgi-bin/idiotbox;
+	}
+
+httpd + slowcgi example:
+
+	location match "/idiotbox" {
+		root "/cgi-bin/idiotbox.cgi"
+		fastcgi
+	}
+
+
+When using a chroot make sure to copy /etc/resolv.conf and /etc/ssl/cert.pem.
+
+To test from the command-line you can do:
+
+	QUERY_STRING="q=funny+cat+video" ./main | sed 1,2d | lynx -stdin
diff --git a/TODO b/TODO
@@ -0,0 +1,16 @@
+- decodeparam, getparam etc: cast ctype functions to (unsigned char).
+
+- order by views does not work in searching channel.
+- pagination does not work in searching in channel.
+- searching in channel works, but not search in user.
+
+? some way to show duration for playlists?
+- show published date in a consistent way? ("non-human friendly": YYYY-mm-dd HH:MM:SS TZ)
+
+- separate code parsing and views for frontend.
+- frontends:
+	- CGI HTTP
+	- CGI gopher
+	- dmenu / console (TSV output?) (no HTTP client, just parsing).
+
+- test/tweak Accept-Language header.
diff --git a/cli.c b/cli.c @@ -0,0 +1,221 @@ +#include <sys/socket.h> +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <netdb.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "youtube.h" + +#ifndef __OpenBSD__ +#define pledge(p1,p2) 0 +#define unveil(p1,p2) 0 +#endif + +#ifndef TLS_CA_CERT_FILE +#define TLS_CA_CERT_FILE "/etc/ssl/cert.pem" +#endif + +/* TODO: escape control-characters etc */ +#define OUT(s) (fputs((s), stdout)) + +struct video *videos; +static int nvideos; + +void +die(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + + exit(1); +} + +int +hexdigit(int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + + return 0; +} + +/* decode until NUL separator or end of "key". */ +int +decodeparam(char *buf, size_t bufsiz, const char *s) +{ + size_t i; + + if (!bufsiz) + return -1; + + for (i = 0; *s && *s != '&'; s++) { + if (i + 3 >= bufsiz) + return -1; + switch (*s) { + case '%': + if (!isxdigit(*(s+1)) || !isxdigit(*(s+2))) + return -1; + buf[i++] = hexdigit(*(s+1)) * 16 + hexdigit(*(s+2)); + s += 2; + break; + case '+': + buf[i++] = ' '; + break; + default: + buf[i++] = *s; + break; + } + } + buf[i] = '\0'; + + return i; +} + +char * +getparam(const char *query, const char *s) +{ + const char *p, *last = NULL; + size_t len; + + len = strlen(s); + for (p = query; (p = strstr(p, s)); p += len) { + if (p[len] == '=' && (p == query || p[-1] == '&' || p[-1] == '?')) + last = p + len + 1; + } + + return (char *)last; +} + +int +render(void) +{ + int i; + + if (pledge("stdio", NULL) == -1) { + fprintf(stderr, "pledge: %s\n", strerror(errno)); + exit(1); + } + + for (i = 0; i < nvideos; i++) { + /* TODO: better printing of other types */ + switch (videos[i].linktype) { + case Channel: + 
OUT("[Channel] "); + OUT(videos[i].channeltitle); + break; + case Movie: + OUT("[Movie] "); + OUT(videos[i].title); + break; + case Playlist: + OUT("[Playlist] "); + OUT(videos[i].title); + break; + default: + OUT(videos[i].title); + break; + } + OUT("\n"); + + if (videos[i].id[0]) { + OUT("URL: https://www.youtube.com/embed/"); + OUT(videos[i].id); + OUT("\n"); + } + + if (videos[i].channelid[0] || videos[i].userid[0]) { + OUT("Atom feed: https://www.youtube.com/feeds/videos.xml?"); + if (videos[i].channelid[0]) { + OUT("channel_id="); + OUT(videos[i].channelid); + } else if (videos[i].userid[0]) { + OUT("user="); + OUT(videos[i].userid); + } + OUT("\n"); + } + + if (videos[i].channelid[0] || videos[i].userid[0]) { + OUT("Channel title: "); + OUT(videos[i].channeltitle); + OUT("\n"); + if (videos[i].channelid[0]) { + OUT("Channelid: "); + OUT(videos[i].channelid); + OUT("\n"); + } else if (videos[i].userid[0]) { + OUT("Userid: "); + OUT(videos[i].userid); + OUT("\n"); + } + } + if (videos[i].publishedat[0]) { + OUT("Published: "); + OUT(videos[i].publishedat); + OUT("\n"); + } + if (videos[i].viewcount[0]) { + OUT("Viewcount: "); + OUT(videos[i].viewcount); + OUT("\n"); + } + if (videos[i].duration[0]) { + OUT("Duration: " ); + OUT(videos[i].duration); + OUT("\n"); + } + OUT("===\n"); + } + + return 0; +} + +static void +usage(const char *argv0) +{ + fprintf(stderr, "usage: %s <keywords>\n", argv0); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + if (pledge("stdio dns inet rpath unveil", NULL) == -1) { + fprintf(stderr, "pledge: %s\n", strerror(errno)); + exit(1); + } + if (unveil(TLS_CA_CERT_FILE, "r") == -1) { + fprintf(stderr, "unveil: %s\n", strerror(errno)); + exit(1); + } + if (unveil(NULL, NULL) == -1) { + fprintf(stderr, "unveil: %s\n", strerror(errno)); + exit(1); + } + + if (argc < 2 || !argv[1][0]) + usage(argv[0]); + + videos = youtube_search(&nvideos, argv[1], "", "", "", "relevance"); + if (!videos || nvideos <= 0) { + OUT("No videos 
found\n"); + exit(1); + } + + render(); + + return 0; +} diff --git a/main.c b/main.c @@ -0,0 +1,464 @@ +#include <sys/socket.h> +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <netdb.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "youtube.h" + +#ifndef __OpenBSD__ +#define pledge(p1,p2) 0 +#define unveil(p1,p2) 0 +#endif + +#ifndef TLS_CA_CERT_FILE +#define TLS_CA_CERT_FILE "/etc/ssl/cert.pem" +#endif + +#define OUT(s) (fputs((s), stdout)) + +extern char **environ; + +struct video *videos; +static int curpage = 1, nvideos; + +/* CGI parameters */ +static char rawsearch[4096], search[4096], mode[16], order[16], page[64]; +static char chan[1024], user[1024]; + +void +die(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + + exit(1); +} + +int +hexdigit(int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + + return 0; +} + +/* decode until NUL separator or end of "key". */ +int +decodeparam(char *buf, size_t bufsiz, const char *s) +{ + size_t i; + + if (!bufsiz) + return -1; + + for (i = 0; *s && *s != '&'; s++) { + if (i + 3 >= bufsiz) + return -1; + switch (*s) { + case '%': + if (!isxdigit(*(s+1)) || !isxdigit(*(s+2))) + return -1; + buf[i++] = hexdigit(*(s+1)) * 16 + hexdigit(*(s+2)); + s += 2; + break; + case '+': + buf[i++] = ' '; + break; + default: + buf[i++] = *s; + break; + } + } + buf[i] = '\0'; + + return i; +} + +char * +getparam(const char *query, const char *s) +{ + const char *p, *last = NULL; + size_t len; + + len = strlen(s); + for (p = query; (p = strstr(p, s)); p += len) { + if (p[len] == '=' && (p == query || p[-1] == '&' || p[-1] == '?')) + last = p + len + 1; + } + + return (char *)last; +} + +/* Escape characters below as HTML 2.0 / XML 1.0. 
*/ +void +xmlencode(const char *s) +{ + for (; *s; s++) { + switch(*s) { + case '<': OUT("&lt;"); break; + case '>': OUT("&gt;"); break; + case '\'': OUT("&#39;"); break; + case '&': OUT("&amp;"); break; + case '"': OUT("&quot;"); break; + default: putchar(*s); + } + } +} + +void +parsecgi(void) +{ + char *query, *p; + size_t len; + + if (!(query = getenv("QUERY_STRING"))) + query = ""; + + /* channel: search in channel */ + if ((p = getparam(query, "chan"))) { + if (decodeparam(chan, sizeof(chan), p) == -1) + chan[0] = '\0'; + } + /* user: search in user */ + if ((p = getparam(query, "user"))) { + if (decodeparam(user, sizeof(user), p) == -1) + user[0] = '\0'; + } + if (!strcmp(chan, "Search all") || !strcmp(user, "Search all")) { + chan[0] = '\0'; + user[0] = '\0'; + } + + /* order */ + if ((p = getparam(query, "o"))) { + if (decodeparam(order, sizeof(order), p) == -1 || + (strcmp(order, "date") && + strcmp(order, "relevance") && + strcmp(order, "views"))) + order[0] = '\0'; + } + if (!order[0]) + snprintf(order, sizeof(order), chan[0] || user[0] ? 
"date" : "relevance"); + + /* page */ + if ((p = getparam(query, "page"))) { + if (decodeparam(page, sizeof(page), p) == -1) + page[0] = '\0'; + /* check if it's a number > 0 and < 100 */ + errno = 0; + curpage = strtol(page, NULL, 10); + if (errno || curpage < 0 || curpage > 100) { + curpage = 1; + page[0] = '\0'; + } + } + + /* mode */ + if ((p = getparam(query, "m"))) { + if (decodeparam(mode, sizeof(mode), p) != -1) { + /* fixup first character (label) for matching */ + if (mode[0]) + mode[0] = tolower((unsigned char)mode[0]); + /* allowed themes */ + if (strcmp(mode, "light") && + strcmp(mode, "dark") && + strcmp(mode, "pink") && + strcmp(mode, "templeos")) + mode[0] = '\0'; + } + } + if (!mode[0]) + snprintf(mode, sizeof(mode), "light"); + + /* search */ + if ((p = getparam(query, "q"))) { + if ((len = strcspn(p, "&")) && len + 1 < sizeof(rawsearch)) { + memcpy(rawsearch, p, len); + rawsearch[len] = '\0'; + } + + if (decodeparam(search, sizeof(search), p) == -1) { + OUT("Status: 401 Bad Request\r\n\r\n"); + exit(1); + } + } +} + +int +render(void) +{ + char tmp[64]; + int i; + + if (pledge("stdio", NULL) == -1) { + OUT("Status: 500 Internal Server Error\r\n\r\n"); + exit(1); + } + + OUT( + "Content-Type: text/html; charset=utf-8\r\n\r\n" + "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n" + "<title>Search: \""); + xmlencode(search); + OUT("\""); + if (nvideos) { + if (videos[0].channelid[0]) + printf(" in %s", videos[0].channeltitle); + else if (videos[0].userid[0]) + printf(" in %s", videos[0].userid); + } + printf(" sorted by %s</title>\n", order); + OUT( + "<link rel=\"stylesheet\" href=\"css/"); + xmlencode(mode); + OUT( + ".css\" type=\"text/css\" media=\"screen\" />\n" + "<link rel=\"icon\" type=\"image/png\" href=\"/favicon.png\" />\n" + "<meta content=\"width=device-width\" name=\"viewport\" />\n" + "</head>\n" + "<body class=\"search\">\n" + "<form method=\"get\" action=\"\">\n"); + + 
OUT("<input type=\"hidden\" name=\"m\" value=\""); + xmlencode(mode); + OUT("\" />\n"); + if (chan[0]) { + OUT("<input type=\"hidden\" name=\"chan\" value=\""); + xmlencode(chan); + OUT("\" />\n"); + } + + OUT( + "<table class=\"search\" width=\"100%\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\">\n" + "<tr>\n" + " <td width=\"100%\" class=\"input\">\n" + " <input type=\"search\" name=\"q\" value=\""); + xmlencode(search); + OUT( + "\" placeholder=\"Search...\" size=\"72\" autofocus=\"autofocus\" class=\"search\" accesskey=\"f\" />\n" + " </td>\n" + " <td nowrap class=\"nowrap\">\n" + " <input type=\"submit\" value=\"Search\" class=\"button\"/>\n"); + + if (chan[0]) + OUT(" <input type=\"submit\" name=\"chan\" value=\"Search all\" title=\"Search globally and not in the selected channel\" accesskey=\"c\" />\n"); + + OUT( + " <select name=\"o\" title=\"Order by\" accesskey=\"o\">\n"); + printf(" <option value=\"date\"%s>Creation date</option>\n", !strcmp(order, "date") ? " selected=\"selected\"" : ""); + printf(" <option value=\"relevance\"%s>Relevance</option>\n", !strcmp(order, "relevance") ? " selected=\"selected\"" : ""); + printf(" <option value=\"views\"%s>Views</option>\n", !strcmp(order, "views") ? 
" selected=\"selected\"" : ""); + OUT( + " </select>\n" + " <label for=\"m\">Style: </label>\n"); + + if (!strcmp(mode, "light")) + OUT("\t\t<input type=\"submit\" name=\"m\" value=\"Dark\" title=\"Dark mode\" id=\"m\" accesskey=\"s\"/>\n"); + else + OUT("\t\t<input type=\"submit\" name=\"m\" value=\"Light\" title=\"Light mode\" id=\"m\" accesskey=\"s\"/>\n"); + + OUT( + " </td>\n" + "</tr>\n" + "</table>\n" + "</form>\n"); + + if (nvideos) { + OUT( + "<hr/>\n" + "<table class=\"videos\" width=\"100%\" border=\"0\" cellpadding=\"0\" cellspacing=\"0\">\n" + "<tbody>\n"); + + for (i = 0; i < nvideos; i++) { + OUT( + "<tr class=\"v\">\n" + " <td class=\"thumb\" width=\"120\" align=\"center\">\n" + " <a href=\"https://www.youtube.com/embed/"); + xmlencode(videos[i].id); + /* TODO: for channel show channel picture in some way? */ + OUT("\"><img src=\"https://i.ytimg.com/vi/"); + xmlencode(videos[i].id); + OUT( + "/default.jpg\" alt=\"\" height=\"90\" border=\"0\" /></a>\n" + " </td>\n" + " <td>\n" + " <span class=\"title\"><a href=\"https://www.youtube.com/embed/"); + xmlencode(videos[i].id); + printf("\" accesskey=\"%d\">", i); + + /* TODO: better printing of other types */ + switch (videos[i].linktype) { + case Channel: + OUT("[Channel] "); + xmlencode(videos[i].channeltitle); + break; + case Movie: + OUT("[Movie] "); + xmlencode(videos[i].title); + break; + case Playlist: + OUT("[Playlist] "); + xmlencode(videos[i].title); + break; + default: + xmlencode(videos[i].title); + break; + } + + OUT( + "</a></span><br/>\n" + " <span class=\"channel\">"); + + OUT("<a title=\"Search in "); + xmlencode(videos[i].channeltitle); + OUT("\" href=\"?"); + if (videos[i].channelid[0]) { + OUT("chan="); + xmlencode(videos[i].channelid); + } else if (videos[i].userid[0]) { + OUT("user="); + xmlencode(videos[i].userid); + } + OUT("&amp;m="); + xmlencode(mode); + OUT("\">"); + xmlencode(videos[i].channeltitle); + OUT("</a>"); + if (videos[i].channelid[0] || videos[i].userid[0]) { + OUT(" 
| <a title=\""); + xmlencode(videos[i].channeltitle); + OUT(" Atom feed\" href=\"https://www.youtube.com/feeds/videos.xml?"); + if (videos[i].channelid[0]) { + OUT("channel_id="); + xmlencode(videos[i].channelid); + } else if (videos[i].userid[0]) { + OUT("user="); + xmlencode(videos[i].userid); + } + OUT("\">Atom feed</a>"); + } + OUT("</span><br/>\n"); + if (videos[i].publishedat[0]) { + OUT(" <span class=\"publishedat\">Published: "); + OUT(videos[i].publishedat); + } + OUT( + "</span><br/>\n" + " <span class=\"stats\">"); + OUT(videos[i].viewcount); + OUT( + "</span><br/>\n" + " </td>\n" + " <td align=\"right\" class=\"a-r\">\n" + " <span class=\"duration\">"); + OUT(videos[i].duration); + OUT( + "</span>\n" + " </td>\n" + "</tr>\n" + "<tr class=\"hr\">\n" + " <td colspan=\"3\"><hr/></td>\n" + "</tr>\n"); + } + OUT("</tbody>\n"); + + OUT( + "<tfoot>\n" + "<tr>\n" + "\t<td align=\"left\" class=\"nowrap\" nowrap>\n"); + if (curpage > 0) { + OUT("\t\t<a href=\"?q="); + xmlencode(search); + OUT("&amp;page="); + snprintf(tmp, sizeof(tmp), "%d", curpage - 1); + xmlencode(tmp); + OUT("&amp;m="); + xmlencode(mode); + OUT("&amp;o="); + xmlencode(order); + if (chan[0]) { + OUT("&amp;chan="); + xmlencode(chan); + } + OUT("\" rel=\"prev\" accesskey=\"p\">&larr; prev</a>\n"); + } + OUT( + "\t</td>\n\t<td></td>\n" + "\t<td align=\"right\" class=\"a-r nowrap\" nowrap>\n"); + + OUT("\t\t<a href=\"?q="); + xmlencode(search); + OUT("&amp;page="); + snprintf(tmp, sizeof(tmp), "%d", curpage + 1); + xmlencode(tmp); + OUT("&amp;m="); + xmlencode(mode); + OUT("&amp;o="); + xmlencode(order); + if (chan[0]) { + OUT("&amp;chan="); + xmlencode(chan); + } + OUT("\" rel=\"next\" accesskey=\"n\">next &rarr;</a>\n"); + + OUT( + "\t</td>\n" + "</tr>\n" + "</tfoot>\n"); + + OUT("</table>\n"); + } + + OUT("</body>\n</html>\n"); + + return 0; +} + +int +main(void) +{ + if (pledge("stdio dns inet rpath unveil", NULL) == -1) { + OUT("Status: 500 Internal Server Error\r\n\r\n"); + exit(1); + } + if 
(unveil(TLS_CA_CERT_FILE, "r") == -1) { + OUT("Status: 500 Internal Server Error\r\n\r\n"); + exit(1); + } + if (unveil(NULL, NULL) == -1) { + OUT("Status: 500 Internal Server Error\r\n\r\n"); + exit(1); + } + + parsecgi(); + + if (!rawsearch[0] && !chan[0] && !user[0]) + goto show; + + videos = youtube_search(&nvideos, rawsearch, chan, user, page, order); + if (!videos || nvideos <= 0) { + OUT("Status: 500 Internal Server Error\r\n\r\n"); + exit(1); + } + +show: + render(); + + return 0; +} diff --git a/xml.c b/xml.c @@ -0,0 +1,474 @@ +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "xml.h" + +static void +xml_parseattrs(XMLParser *x) +{ + size_t namelen = 0, valuelen; + int c, endsep, endname = 0, valuestart = 0; + + while ((c = x->getnext()) != EOF) { + if (isspace(c)) { + if (namelen) + endname = 1; + continue; + } else if (c == '?') + ; /* ignore */ + else if (c == '=') { + x->name[namelen] = '\0'; + valuestart = 1; + endname = 1; + } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) { + /* attribute without value */ + x->name[namelen] = '\0'; + if (x->xmlattrstart) + x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); + if (x->xmlattrend) + x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); + endname = 0; + x->name[0] = c; + namelen = 1; + } else if (namelen && valuestart) { + /* attribute with value */ + if (x->xmlattrstart) + x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); + + valuelen = 0; + if (c == '\'' || c == '"') { + endsep = c; + } else { + endsep = ' '; /* isspace() */ + goto startvalue; + } + + while ((c = x->getnext()) != EOF) { +startvalue: + if (c == '&') { /* entities */ + x->data[valuelen] = '\0'; + /* call data function with data before entity if there is data */ + if (valuelen && x->xmlattr) + 
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + while ((c = x->getnext()) != EOF) { + if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) + break; + if (valuelen < sizeof(x->data) - 1) + x->data[valuelen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + break; + } + if (c == ';') { + x->data[valuelen] = '\0'; + if (x->xmlattrentity) + x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + valuelen = 0; + break; + } + } + } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) { + if (valuelen < sizeof(x->data) - 1) { + x->data[valuelen++] = c; + } else { + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + x->data[0] = c; + valuelen = 1; + } + } + if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) { + x->data[valuelen] = '\0'; + if (x->xmlattr) + x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); + if (x->xmlattrend) + x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); + break; + } + } + namelen = endname = valuestart = 0; + } else if (namelen < sizeof(x->name) - 1) { + x->name[namelen++] = c; + } + if (c == '>') { + break; + } else if (c == '/') { + x->isshorttag = 1; + x->name[0] = '\0'; + namelen = 0; + } + } +} + +static void +xml_parsecomment(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + if (x->xmlcommentstart) + x->xmlcommentstart(x); + while ((c = x->getnext()) != EOF) { + if (c == '-' || c == '>') { + if (x->xmlcomment) { + x->data[datalen] = '\0'; + x->xmlcomment(x, x->data, datalen); + datalen = 0; + } + } + + if (c == '-') { + if (++i > 2) { + if (x->xmlcomment) + for (; i > 2; i--) + x->xmlcomment(x, "-", 1); + i = 2; + } + continue; + } else if (c == 
'>' && i == 2) { + if (x->xmlcommentend) + x->xmlcommentend(x); + return; + } else if (i) { + if (x->xmlcomment) { + for (; i > 0; i--) + x->xmlcomment(x, "-", 1); + } + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmlcomment) + x->xmlcomment(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } +} + +static void +xml_parsecdata(XMLParser *x) +{ + size_t datalen = 0, i = 0; + int c; + + if (x->xmlcdatastart) + x->xmlcdatastart(x); + while ((c = x->getnext()) != EOF) { + if (c == ']' || c == '>') { + if (x->xmlcdata) { + x->data[datalen] = '\0'; + x->xmlcdata(x, x->data, datalen); + datalen = 0; + } + } + + if (c == ']') { + if (++i > 2) { + if (x->xmlcdata) + for (; i > 2; i--) + x->xmlcdata(x, "]", 1); + i = 2; + } + continue; + } else if (c == '>' && i == 2) { + if (x->xmlcdataend) + x->xmlcdataend(x); + return; + } else if (i) { + if (x->xmlcdata) + for (; i > 0; i--) + x->xmlcdata(x, "]", 1); + i = 0; + } + + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmlcdata) + x->xmlcdata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } +} + +static int +codepointtoutf8(long r, char *s) +{ + if (r == 0) { + return 0; /* NUL byte */ + } else if (r <= 0x7F) { + /* 1 byte: 0aaaaaaa */ + s[0] = r; + return 1; + } else if (r <= 0x07FF) { + /* 2 bytes: 00000aaa aabbbbbb */ + s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ + s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ + return 2; + } else if (r <= 0xFFFF) { + /* 3 bytes: aaaabbbb bbcccccc */ + s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ + s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ + s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ + return 3; + } else { + /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ + s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ + s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ + s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 
10cccccc */ + s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ + return 4; + } +} + +static int +namedentitytostr(const char *e, char *buf, size_t bufsiz) +{ + static const struct { + char *entity; + int c; + } entities[] = { + { "&amp;", '&' }, + { "&lt;", '<' }, + { "&gt;", '>' }, + { "&apos;", '\'' }, + { "&quot;", '"' }, + { "&AMP;", '&' }, + { "&LT;", '<' }, + { "&GT;", '>' }, + { "&APOS;", '\'' }, + { "&QUOT;", '"' } + }; + size_t i; + + /* buffer is too small */ + if (bufsiz < 2) + return -1; + + /* doesn't start with &: can't match */ + if (*e != '&') + return 0; + + for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { + if (!strcmp(e, entities[i].entity)) { + buf[0] = entities[i].c; + buf[1] = '\0'; + return 1; + } + } + return 0; +} + +static int +numericentitytostr(const char *e, char *buf, size_t bufsiz) +{ + long l; + int len; + char *end; + + /* buffer is too small */ + if (bufsiz < 5) + return -1; + + /* not a numeric entity */ + if (e[0] != '&' || e[1] != '#') + return 0; + + /* e[1] == '#', numeric / hexadecimal entity */ + e += 2; /* skip "&#" */ + errno = 0; + /* hex (16) or decimal (10) */ + if (*e == 'x') + l = strtoul(e + 1, &end, 16); + else + l = strtoul(e, &end, 10); + /* invalid value or not a well-formed entity or too high codepoint */ + if (errno || *end != ';' || l > 0x10FFFF) + return 0; + len = codepointtoutf8(l, buf); + buf[len] = '\0'; + + return len; +} + +/* convert named- or numeric entity string to buffer string + * returns byte-length of string. 
*/ +int +xml_entitytostr(const char *e, char *buf, size_t bufsiz) +{ + /* buffer is too small */ + if (bufsiz < 5) + return -1; + /* doesn't start with & */ + if (e[0] != '&') + return 0; + /* named entity */ + if (e[1] != '#') + return namedentitytostr(e, buf, bufsiz); + else /* numeric entity */ + return numericentitytostr(e, buf, bufsiz); +} + +void +xml_parse(XMLParser *x) +{ + size_t datalen, tagdatalen; + int c, isend; + + if (!x->getnext) + return; + while ((c = x->getnext()) != EOF && c != '<') + ; /* skip until < */ + + while (c != EOF) { + if (c == '<') { /* parse tag */ + if ((c = x->getnext()) == EOF) + return; + + if (c == '!') { /* cdata and comments */ + for (tagdatalen = 0; (c = x->getnext()) != EOF;) { + /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */ + if (tagdatalen <= sizeof("[CDATA[") - 1) + x->data[tagdatalen++] = c; + if (c == '>') + break; + else if (c == '-' && tagdatalen == sizeof("--") - 1 && + (x->data[0] == '-')) { + xml_parsecomment(x); + break; + } else if (c == '[') { + if (tagdatalen == sizeof("[CDATA[") - 1 && + !strncmp(x->data, "[CDATA[", tagdatalen)) { + xml_parsecdata(x); + break; + } + } + } + } else { + /* normal tag (open, short open, close), processing instruction. */ + x->tag[0] = c; + x->taglen = 1; + x->isshorttag = isend = 0; + + /* treat processing instruction as shorttag, don't strip "?" prefix. 
*/ + if (c == '?') { + x->isshorttag = 1; + } else if (c == '/') { + if ((c = x->getnext()) == EOF) + return; + x->tag[0] = c; + isend = 1; + } + + while ((c = x->getnext()) != EOF) { + if (c == '/') + x->isshorttag = 1; /* short tag */ + else if (c == '>' || isspace(c)) { + x->tag[x->taglen] = '\0'; + if (isend) { /* end tag, starts with </ */ + if (x->xmltagend) + x->xmltagend(x, x->tag, x->taglen, x->isshorttag); + x->tag[0] = '\0'; + x->taglen = 0; + } else { + /* start tag */ + if (x->xmltagstart) + x->xmltagstart(x, x->tag, x->taglen); + if (isspace(c)) + xml_parseattrs(x); + if (x->xmltagstartparsed) + x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); + } + /* call tagend for shortform or processing instruction */ + if (x->isshorttag) { + if (x->xmltagend) + x->xmltagend(x, x->tag, x->taglen, x->isshorttag); + x->tag[0] = '\0'; + x->taglen = 0; + } + break; + } else if (x->taglen < sizeof(x->tag) - 1) + x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ + } + } + } else { + /* parse tag data */ + datalen = 0; + if (x->xmldatastart) + x->xmldatastart(x); + while ((c = x->getnext()) != EOF) { + if (c == '&') { + if (datalen) { + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + } + x->data[0] = c; + datalen = 1; + while ((c = x->getnext()) != EOF) { + if (c == '<') + break; + if (datalen < sizeof(x->data) - 1) + x->data[datalen++] = c; + else { + /* entity too long for buffer, handle as normal data */ + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + break; + } + if (c == ';') { + x->data[datalen] = '\0'; + if (x->xmldataentity) + x->xmldataentity(x, x->data, datalen); + datalen = 0; + break; + } + } + } else if (c != '<') { + if (datalen < sizeof(x->data) - 1) { + x->data[datalen++] = c; + } else { + x->data[datalen] = '\0'; + if (x->xmldata) + x->xmldata(x, x->data, datalen); + x->data[0] = c; + datalen = 1; + } + } + if (c == '<') { + 
x->data[datalen] = '\0'; + if (x->xmldata && datalen) + x->xmldata(x, x->data, datalen); + if (x->xmldataend) + x->xmldataend(x); + break; + } + } + } + } +} diff --git a/xml.h b/xml.h @@ -0,0 +1,40 @@ +typedef struct xmlparser { + /* handlers */ + void (*xmlattr)(struct xmlparser *, const char *, size_t, + const char *, size_t, const char *, size_t); + void (*xmlattrend)(struct xmlparser *, const char *, size_t, + const char *, size_t); + void (*xmlattrstart)(struct xmlparser *, const char *, size_t, + const char *, size_t); + void (*xmlattrentity)(struct xmlparser *, const char *, size_t, + const char *, size_t, const char *, size_t); + void (*xmlcdatastart)(struct xmlparser *); + void (*xmlcdata)(struct xmlparser *, const char *, size_t); + void (*xmlcdataend)(struct xmlparser *); + void (*xmlcommentstart)(struct xmlparser *); + void (*xmlcomment)(struct xmlparser *, const char *, size_t); + void (*xmlcommentend)(struct xmlparser *); + void (*xmldata)(struct xmlparser *, const char *, size_t); + void (*xmldataend)(struct xmlparser *); + void (*xmldataentity)(struct xmlparser *, const char *, size_t); + void (*xmldatastart)(struct xmlparser *); + void (*xmltagend)(struct xmlparser *, const char *, size_t, int); + void (*xmltagstart)(struct xmlparser *, const char *, size_t); + void (*xmltagstartparsed)(struct xmlparser *, const char *, + size_t, int); + + int (*getnext)(void); + + /* current tag */ + char tag[1024]; + size_t taglen; + /* current tag is in short form ? 
<tag /> */ + int isshorttag; + /* current attribute name */ + char name[1024]; + /* data buffer used for tag data, cdata and attribute data */ + char data[BUFSIZ]; +} XMLParser; + +int xml_entitytostr(const char *, char *, size_t); +void xml_parse(XMLParser *); diff --git a/youtube.c b/youtube.c @@ -0,0 +1,535 @@ +#include <sys/socket.h> +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <netdb.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <tls.h> + +#include "youtube.h" +#include "xml.h" + +#define READ_BUF_SIZ 16384 /* read buffer in bytes */ +#define MAX_RESPONSETIMEOUT 10 /* timeout in seconds */ +#define MAX_RESPONSESIZ 500000 /* max download size in bytes */ + +#define STRP(s) s,sizeof(s)-1 + +static const int maxvideos = 30; +static struct video videos[maxvideos + 1]; +static int nvideos; + +/* temporary variables to copy for states */ +static char id[256], userid[256]; + +/* states */ +static int metainfocount; +static enum ItemState { + None = 0, + Item = 1, Pager = 2, + Metainfo = 4, Title = 8, User = 16, Videotime = 32, +} state; + +/* data buffers, size and offset used for parsing XML, see getnext() */ +static char *responsedata; +static size_t responsesize; +static size_t responseoff; + +/* ? TODO: don't die in youtube.c ? */ +static void +die(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + + exit(1); +} + +static int +hexdigit(int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + + return 0; +} + +/* decode until NUL separator or end of "key". 
/*
 * Test whether the class name `clss` (of length `len`) is present as a whole
 * whitespace-separated token in the class attribute value `classes`.
 *
 * Every occurrence of `clss` is examined, not only the first one: the name
 * may first appear as part of a longer class name (for example
 * "yt-lockup-tile yt-lockup") and only later as a token of its own.
 *
 * Returns 1 when a whole-token match exists, 0 otherwise. An empty class
 * name never matches.
 */
static int
isclassmatch(const char *classes, const char *clss, size_t len)
{
	const char *p;

	/* an empty needle matches everywhere and is never a valid class */
	if (!len || !*clss)
		return 0;

	for (p = classes; (p = strstr(p, clss)); p++) {
		/* token must start at the beginning or after whitespace and
		   must end at whitespace or at the end of the string */
		if ((p == classes || isspace((unsigned char)p[-1])) &&
		    (!p[len] || isspace((unsigned char)p[len])))
			return 1;
	}

	return 0;
}
(!strcmp(t, "div") && !strcmp(a, "class") && + isclassmatch(v, STRP("yt-lockup"))) { + state |= Item; + if (videos[nvideos].linktype) { + if (videos[nvideos].channelid[0] || videos[nvideos].userid[0] || + videos[nvideos].linktype != Video) + grouped = -1; + if (videos[nvideos].linktype == Channel) + grouped = nvideos; + if (grouped != -1 && !videos[nvideos].channelid[0]) { + strlcpy(videos[nvideos].channelid, videos[grouped].channelid, sizeof(videos[nvideos].channelid)); + strlcpy(videos[nvideos].channeltitle, videos[grouped].channeltitle, sizeof(videos[nvideos].channeltitle)); + } + nvideos++; + } + if (strstr(v, " yt-lockup-channel ")) + videos[nvideos].linktype = Channel; + else if (strstr(v, "yt-lockup-movie-")) + videos[nvideos].linktype = Movie; + else if (strstr(v, " yt-lockup-playlist ")) + videos[nvideos].linktype = Playlist; + if (strstr(v, " yt-lockup-video ")) + videos[nvideos].linktype = Video; + } + if (!(state & Item)) + return; + + if (!strcmp(t, "span") && !strcmp(a, "class") && isclassmatch(v, STRP("video-time"))) + state |= Videotime; + if (!strcmp(t, "ul") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-meta-info"))) { + state |= Metainfo; + metainfocount = 0; + } + if (!strcmp(t, "h3") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-title"))) + state |= Title; + if (!strcmp(t, "div") && !strcmp(a, "class") && isclassmatch(v, STRP("yt-lockup-byline"))) + state |= User; + + if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "title")) { + if (videos[nvideos].linktype == Channel) + strlcat(videos[nvideos].channeltitle, v, sizeof(videos[nvideos].channeltitle)); + else + strlcat(videos[nvideos].title, v, sizeof(videos[nvideos].title)); + } + + if ((state & Title) && !strcmp(t, "a") && !strcmp(a, "href")) + strlcat(id, v, sizeof(id)); + + if (!strcmp(t, "button") && !strcmp(a, "data-channel-external-id")) + strlcat(videos[nvideos].channelid, v, sizeof(videos[nvideos].channelid)); + + if ((state & User) && !strcmp(t, "a") && 
!strcmp(a, "href")) + strlcat(userid, v, sizeof(userid)); +} + +void +xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, + const char *v, size_t vl) +{ + const char *s; + + if (!(state & Pager) && nvideos >= maxvideos) + return; + + s = entitytostr(v); + xmlattr(x, t, tl, a, al, s, strlen(s)); +} + +void +xmldata(XMLParser *x, const char *d, size_t dl) +{ + if ((state & Pager)) + return; + + /* optimization: no need to process and must not process videos after this */ + if (!state || nvideos >= maxvideos) + return; + + /* use parsed link type for meta info since this metainfo differs per type like: + channel, playlist, video */ + if ((state & Metainfo)) { + switch (videos[nvideos].linktype) { + case Channel: + if (metainfocount == 1) + strlcat(videos[nvideos].channelvideos, d, sizeof(videos[nvideos].channelvideos)); + break; + default: + if (metainfocount == 1) + strlcat(videos[nvideos].publishedat, d, sizeof(videos[nvideos].publishedat)); + else if (metainfocount == 2) + strlcat(videos[nvideos].viewcount, d, sizeof(videos[nvideos].viewcount)); + } + } + if ((state & Videotime) && !strcmp(x->tag, "span")) + strlcat(videos[nvideos].duration, d, sizeof(videos[nvideos].duration)); + if ((state & User) && !strcmp(x->tag, "a")) + strlcat(videos[nvideos].channeltitle, d, sizeof(videos[nvideos].channeltitle)); +} + +void +xmldataentity(XMLParser *x, const char *d, size_t dl) +{ + const char *s; + + /* optimization: no need for entity conversion */ + if (!state || nvideos >= maxvideos) + return; + + s = entitytostr(d); + xmldata(x, s, strlen(s)); +} + +void +xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) +{ + char *p; + + if ((state & Metainfo) && !strcmp(t, "ul")) + state &= ~Metainfo; + if ((state & Title) && !strcmp(t, "h3")) { + state &= ~Title; + + if (nvideos >= maxvideos) + return; + + if (!strncmp(id, "/watch", sizeof("/watch") - 1)) { + if (!videos[nvideos].linktype) + videos[nvideos].linktype = Video; + if ((p = 
getparam(id, "v"))) { + if (decodeparam(videos[nvideos].id, sizeof(videos[nvideos].id), p) == -1) + videos[nvideos].id[0] = '\0'; + } + } + + id[0] = '\0'; + } + if ((state & User)) { + state &= ~User; + + if (nvideos >= maxvideos) + return; + + /* can be user or channel */ + if (!strncmp(userid, "/channel/", sizeof("/channel/") - 1)) { + strlcpy(videos[nvideos].channelid, + userid + sizeof("/channel/") - 1, + sizeof(videos[nvideos].channelid)); + } else if (!strncmp(userid, "/user/", sizeof("/user/") - 1)) { + strlcpy(videos[nvideos].userid, + userid + sizeof("/user/") - 1, + sizeof(videos[nvideos].userid)); + } + + userid[0] = '\0'; + } + if ((state & Videotime)) + state &= ~Videotime; +} + +void +xmltagstart(XMLParser *x, const char *t, size_t tl) +{ + if ((state & Metainfo) && !strcmp(t, "li")) + metainfocount++; +} + +char * +readtls(struct tls *t) +{ + char *buf; + size_t len = 0, size = 0; + ssize_t r; + + /* always allocate an empty buffer */ + if (!(buf = calloc(1, size + 1))) + die("calloc: %s\n", strerror(errno)); + + while (1) { + if (len + READ_BUF_SIZ + 1 > size) { + /* allocate size: common case is small textfiles */ + size += READ_BUF_SIZ; + if (!(buf = realloc(buf, size + 1))) + die("realloc: %s\n", strerror(errno)); + } + if ((r = tls_read(t, &buf[len], READ_BUF_SIZ)) <= 0) + break; + len += r; + buf[len] = '\0'; + if (len > MAX_RESPONSESIZ) + die("response is too big: > %zu bytes\n", MAX_RESPONSESIZ); + } + if (r < 0) + die("tls_read: %s\n", tls_error(t)); + + return buf; +} + +int +edial(const char *host, const char *port) +{ + struct addrinfo hints, *res, *res0; + int error, save_errno, s; + const char *cause = NULL; + struct timeval timeout; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_NUMERICSERV; /* numeric port only */ + if ((error = getaddrinfo(host, port, &hints, &res0))) + die("%s: %s: %s:%s\n", __func__, gai_strerror(error), host, port); + s = -1; + for 
/*
 * Perform an HTTPS GET request for `path` on www.youtube.com (port 443).
 *
 * Returns the raw response as read by readtls() (a heap-allocated,
 * NUL-terminated buffer owned by the caller), or NULL when `path` is too
 * long to fit into the request buffer. Connection and TLS failures
 * terminate the program through die().
 */
char *
request(const char *path)
{
	struct tls *t;
	const char *host = "www.youtube.com";
	const char *p;
	char req[4096];
	char *data;
	size_t len;
	ssize_t w;
	int fd, n;

	/* use HTTP/1.0, don't use HTTP/1.1 using ugly chunked-encoding */
	n = snprintf(req, sizeof(req),
		"GET %s HTTP/1.0\r\n"
		"Host: %s\r\n"
		"Accept-Language: en-US\r\n" /* TODO: better one. */
		"Connection: close\r\n"
		"\r\n", path, host);
	/* never send a truncated request */
	if (n < 0 || (size_t)n >= sizeof(req))
		return NULL;

	if (tls_init() == -1)
		die("tls_init\n");

	/* there is no context to ask for an error message when this fails */
	if (!(t = tls_client()))
		die("tls_client\n");

	fd = edial(host, "443");

	if (tls_connect_socket(t, fd, host) == -1)
		die("tls_connect: %s\n", tls_error(t));

	/* tls_write() may accept fewer bytes than requested: keep writing
	   until the complete request has been sent */
	for (p = req, len = (size_t)n; len > 0; p += w, len -= (size_t)w) {
		if ((w = tls_write(t, p, len)) < 0)
			die("tls_write: %s\n", tls_error(t));
		if (w == 0)
			die("tls_write: short write\n");
	}

	data = readtls(t);

	tls_close(t);
	tls_free(t);
	/* the socket was handed in with tls_connect_socket(), so closing the
	   TLS layer does not close the descriptor */
	close(fd);

	return data;
}
*/ + if ((chan[0] || user[0]) && !s[0]) + s = "+"; + + if (user[0]) + snprintf(path, sizeof(path), "/user/%s/search?query=%s", user, s); + else if (chan[0]) + snprintf(path, sizeof(path), "/channel/%s/search?query=%s", chan, s); + else + snprintf(path, sizeof(path), "/results?search_query=%s", s); + + if (page[0]) { + strlcat(path, "&page=", sizeof(path)); + strlcat(path, page, sizeof(path)); + } + + if (order[0]) { + strlcat(path, "&search_sort=", sizeof(path)); + if (!strcmp(order, "date")) + strlcat(path, "video_date_uploaded", sizeof(path)); + else if (!strcmp(order, "relevance")) + strlcat(path, "video_relevance", sizeof(path)); + else if (!strcmp(order, "views")) + strlcat(path, "video_view_count", sizeof(path)); + } + + /* check if request is too long */ + if (strlen(path) >= sizeof(path) - 1) + return NULL; + + return request(path); +} + +int +getnext(void) +{ + if (responseoff >= responsesize) + return EOF; + return responsedata[responseoff++]; +} + +/* TODO: ? keep search state in some separate context + like responsedata, responsesize. 
+ */ +struct video * +youtube_search(int *nretvideos, + const char *rawsearch, const char *chan, const char *user, + const char *page, const char *order) +{ + XMLParser x = { 0 }; + char *data, *s; + + *nretvideos = -1; + + if (!(data = request_search(rawsearch, chan, user, page, order))) + return NULL; + if (!(s = strstr(data, "\r\n\r\n"))) + return NULL; /* invalid response */ + s += strlen("\r\n\r\n"); + + responsedata = s; + responsesize = strlen(s); + + x.xmlattr = xmlattr; + x.xmlattrentity = xmlattrentity; + x.xmldata = xmldata; + x.xmldataentity = xmldataentity; + x.xmltagend = xmltagend; + x.xmltagstart = xmltagstart; + + x.getnext = getnext; + + xml_parse(&x); + + *nretvideos = nvideos; + + return videos; +} diff --git a/youtube.h b/youtube.h @@ -0,0 +1,17 @@ +struct video { + enum LinkType { Unknown = 0, Channel, Movie, Playlist, Video } linktype; + char id[32]; + char title[1024]; + char channeltitle[1024]; + char channelid[256]; + char userid[256]; + char publishedat[32]; + char viewcount[32]; + char duration[32]; + char channelvideos[32]; /* for channel */ +}; + +struct video * +youtube_search(int *nretvideos, + const char *rawsearch, const char *chan, const char *user, + const char *page, const char *order);