toybox/toys/net/wget.c
<<
>>
Prefs
   1/* wget.c - Simple downloader to get the resource file from a HTTP server
   2 *
   3 * Copyright 2016 Lipi C.H. Lee <lipisoft@gmail.com>
   4 * Copyright 2021 Eric Molitor <eric@molitor.org>
   5 *
   6 * Relevant sources of information
   7 * -------------------------------
   8 * HTTP 1.1: https://www.rfc-editor.org/rfc/rfc7230
   9 * Chunked Encoding: https://www.rfc-editor.org/rfc/rfc7230#section-4.1
  10 * UTF-8 Encoded Header Values https://www.rfc-editor.org/rfc/rfc5987
  11 *
  12 * Test URLs
  13 * ---------
  14 * Chunked Encoding: https://jigsaw.w3.org/HTTP/ChunkedScript
  15 * Redirect 301: https://jigsaw.w3.org/HTTP/300/301.html
  16 * Redirect 302: https://jigsaw.w3.org/HTTP/300/302.html
  17 * TLS 1.0: https://tls-v1-0.badssl.com:1010/
  18 * TLS 1.1: https://tls-v1-1.badssl.com:1011/
  19 * TLS 1.2: https://tls-v1-2.badssl.com:1012/
  20 * TLS 1.3: https://tls13.1d.pw/
  21 * Transfer Encoding [gzip|deflate]: https://jigsaw.w3.org/HTTP/TE/bar.txt
  22 *
  23 *
  24 * todo: Add support for configurable TLS versions
  25 * todo: Add support for ftp
  26 * todo: Add support for Transfer Encoding (gzip|deflate)
  27 * todo: Add support for RFC5987
  28
  29USE_WGET(NEWTOY(wget, "<1>1(max-redirect)#<0=20d(debug)O(output-document):p(post-data):", TOYFLAG_USR|TOYFLAG_BIN))
  30
  31config WGET
  32  bool "wget"
  33  default y
  34  help
  35    usage: wget [OPTIONS]... [URL]
  36        --max-redirect          maximum redirections allowed
  37    -d, --debug                 print lots of debugging information
  38    -O, --output-document=FILE  specify output filename
  39    -p, --post-data=DATA        send data in body of POST request
  40
  41    examples:
  42      wget http://www.example.com
  43
  44config WGET_LIBTLS
  45  bool "Enable HTTPS support for wget via LibTLS"
  46  default n
  47  depends on WGET && !TOYBOX_LIBCRYPTO
  48  help
  49    Enable HTTPS support for wget by linking to LibTLS.
  50    Supports using libtls, libretls or libtls-bearssl.
  51
  52    Use TOYBOX_LIBCRYPTO to enable HTTPS support via OpenSSL.
  53*/
  54
  55#define FOR_wget
  56#include "toys.h"
  57
  58#if CFG_WGET_LIBTLS
  59#define WGET_SSL 1
  60#include <tls.h>
  61#elif CFG_TOYBOX_LIBCRYPTO
  62#define WGET_SSL 1
  63#include <openssl/crypto.h>
  64#include <openssl/ssl.h>
  65#include <openssl/err.h>
  66#else
  67#define WGET_SSL 0
  68#endif
  69#define HTTPS (WGET_SSL && TT.https)
  70
  71
  72GLOBALS(
  73  char *p, *O;
  74  long max_redirect;
  75
  76  int sock, https;
  77  char *url;
  78#if CFG_WGET_LIBTLS
  79  struct tls *tls;
  80#elif CFG_TOYBOX_LIBCRYPTO
  81  struct ssl_ctx_st *ctx;
  82  struct ssl_st *ssl;
  83#endif
  84)
  85
  86// get http info in URL
  87static void wget_info(char *url, char **host, char **port, char **path)
  88{
  89  char *ss = url;
  90
  91  // Must start with case insensitive http:// or https://
  92  if (strncasecmp(url, "http", 4)) url = 0;
  93  else {
  94    url += 4;
  95    if ((TT.https = WGET_SSL && toupper(*url=='s'))) url++;
  96    if (!strstart(&url, "://")) url = 0;
  97  }
  98  if (!url) error_exit("unsupported protocol: %s", ss);
  99  if ((*path = strchr(*host = url, '/'))) *((*path)++) = 0;
 100  else *path = "";
 101
 102  // Get port number and trim literal IPv6 addresses
 103  if (**host=='[' && (ss = strchr(++*host, ']'))) {
 104    *ss++ = 0;
 105    *port = (*ss==':') ? ++ss : 0;
 106  } else if ((*port = strchr(*host, ':'))) *((*port)++) = 0;
 107  if (!*port) *port = HTTPS ? "443" : "80";
 108}
 109
 110static void wget_connect(char *host, char *port)
 111{
 112  if (!HTTPS)
 113    TT.sock = xconnectany(xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0));
 114  else {
 115#if CFG_WGET_LIBTLS
 116    struct tls_config *cfg = NULL;
 117    uint32_t protocols;
 118    if (!(TT.tls = tls_client()))
 119      error_exit("tls_client: %s", tls_error(TT.tls));
 120    if (!(cfg = tls_config_new()))
 121      error_exit("tls_config_new: %s", tls_config_error(cfg));
 122    if (tls_config_parse_protocols(&protocols, "tlsv1.2"))
 123      error_exit("tls_config_parse_protocols");
 124    if (tls_config_set_protocols(cfg, protocols))
 125      error_exit("tls_config_set_protocols: %s", tls_config_error(cfg));
 126    if (tls_configure(TT.tls, cfg))
 127      error_exit("tls_configure: %s", tls_error(TT.tls));
 128    tls_config_free(cfg);
 129
 130    if (tls_connect(TT.tls, host, port))
 131      error_exit("tls_connect: %s", tls_error(TT.tls));
 132#elif CFG_TOYBOX_LIBCRYPTO
 133    SSL_library_init();
 134    OpenSSL_add_all_algorithms();
 135    SSL_load_error_strings();
 136    ERR_load_crypto_strings();
 137
 138    TT.ctx = SSL_CTX_new(TLS_client_method());
 139    if (!TT.ctx) error_exit("SSL_CTX_new");
 140
 141    TT.sock = xconnectany(xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0));
 142
 143    TT.ssl = SSL_new(TT.ctx);
 144    if (!TT.ssl)
 145      error_exit("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL));
 146
 147    if (!SSL_set_tlsext_host_name(TT.ssl, host))
 148      error_exit("SSL_set_tlsext_host_name: %s",
 149                 ERR_error_string(ERR_get_error(), NULL));
 150
 151    SSL_set_fd(TT.ssl, TT.sock);
 152    if (SSL_connect(TT.ssl) == -1)
 153      error_exit("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL));
 154
 155    if (FLAG(d)) printf("TLS: %s\n", SSL_get_cipher(TT.ssl));
 156#endif
 157  }
 158}
 159
 160static size_t wget_read(void *buf, size_t len)
 161{
 162  if (!HTTPS) return xread(TT.sock, buf, len);
 163  else {
 164    char *err = 0;
 165    int ret;
 166
 167#if CFG_WGET_LIBTLS
 168    if ((ret = tls_read(TT.tls, buf, len))<0) err = tls_error(TT.tls);
 169#elif CFG_TOYBOX_LIBCRYPTO
 170    if ((ret = SSL_read(TT.ssl, buf, len))<0)
 171      err = ERR_error_string(ERR_get_error(), 0);
 172#endif
 173    if (err) error_exit("https read: %s", err);
 174
 175    return ret;
 176  }
 177}
 178
 179static void wget_write(void *buf, size_t len)
 180{
 181  if (!HTTPS) xwrite(TT.sock, buf, len);
 182  else {
 183    char *err = 0;
 184
 185#if CFG_WGET_LIBTLS
 186    if (len != tls_write(TT.tls, buf, len)) err = tls_error(TT.tls);
 187#elif CFG_TOYBOX_LIBCRYPTO
 188    if (len != SSL_write(TT.ssl, buf, len))
 189      err = ERR_error_string(ERR_get_error(), 0);
 190#endif
 191    if (err) error_exit("https write: %s", err);
 192  }
 193}
 194
 195static void wget_close()
 196{
 197  if (TT.sock) {
 198      xclose(TT.sock);
 199      TT.sock = 0;
 200  }
 201
 202#if CFG_WGET_LIBTLS
 203  if (TT.tls) {
 204    tls_close(TT.tls);
 205    tls_free(TT.tls);
 206    TT.tls = 0;
 207  }
 208#elif CFG_TOYBOX_LIBCRYPTO
 209  if (TT.ssl) {
 210    SSL_shutdown(TT.ssl);
 211    SSL_free(TT.ssl);
 212    TT.ssl = 0;
 213  }
 214
 215  if (TT.ctx) {
 216    SSL_CTX_free(TT.ctx);
 217    TT.ctx = 0;
 218  }
 219#endif
 220}
 221
 222static char *wget_find_header(char *header, char *val)
 223{
 224  if (!(header = strcasestr(header, val))) return 0;
 225  header += strlen(val);
 226
 227  return xstrndup(header, strcspn(header, "\r\n"));
 228}
 229
 230void wget_main(void)
 231{
 232  long status = 0;
 233  size_t len, c_len = 0;
 234  int fd = 0, ii;
 235  char *body, *index, *host, *port, *path = 0, *chunked, *ss;
 236  char agent[] = "toybox wget/" TOYBOX_VERSION;
 237
 238  TT.url = escape_url(*toys.optargs, 0);
 239
 240  // Ask server for URL, following redirects until success
 241  while (status != 200) {
 242    if (!TT.max_redirect--) error_exit("Too many redirects");
 243
 244    // Connect and write request
 245    wget_info(TT.url, &host, &port, &path);
 246    if (TT.p) sprintf(toybuf, "Content-Length: %ld\r\n", (long)strlen(TT.p));
 247    ss = xmprintf("%s /%s HTTP/1.1\r\nHost: %s\r\nUser-Agent: %s\r\n"
 248                  "Connection: close\r\n%s\r\n%s", FLAG(p) ? "POST" : "GET",
 249                  path, host, agent, FLAG(p) ? toybuf : "", FLAG(p)?TT.p:"");
 250    if (FLAG(d)) printf("--- Request\n%s", ss);
 251    wget_connect(host, port);
 252    wget_write(ss, strlen(ss));
 253    free(ss);
 254
 255    // Read HTTP response into toybuf (probably with some body at end)
 256    for (index = toybuf;
 257      (len = wget_read(index, sizeof(toybuf)-(index-toybuf)))>0; index += len);
 258
 259    // Split response into header and body, and null terminate header.
 260    // (RFC7230 says header cannot contain NUL.)
 261    if (!(body = memmem(ss = toybuf, index-toybuf, "\r\n\r\n", 4)))
 262      error_exit("response header too large");
 263    *body = 0;
 264    body += 4;
 265    len = index-body;
 266    if (FLAG(d)) printf("--- Response\n%s\n\n", toybuf);
 267
 268    status = strstart(&ss, "HTTP/1.1 ") ? strtol(ss, 0, 10) : 0;
 269    if ((status == 301) || (status == 302)) {
 270      if (!(ss = wget_find_header(toybuf, "Location: ")))
 271        error_exit("bad redirect");
 272      free(TT.url);
 273      TT.url = ss;
 274      wget_close();
 275    } else if (status != 200) error_exit("response %ld", status);
 276  }
 277
 278  // Open output file
 279  if (TT.O && !strcmp(TT.O, "-")) fd = 1;
 280  else if (!TT.O) {
 281    ss = wget_find_header(toybuf, "Content-Disposition: attachment; filename=");
 282    if (ss) {
 283      unescape_url(ss);
 284      for (ii = strlen(ss); ii; ii--) {
 285        if (ss[ii]=='/') memmove(ss, ss+ii, strlen(ss+ii));
 286        break;
 287      }
 288      if (!*ss) {
 289        free(ss);
 290        ss = 0;
 291      }
 292    }
 293    if (!ss) {
 294      path = 0;
 295      for (ii = 0, ss = *toys.optargs; *ss && *ss!='?' && *ss!='#'; ss++)
 296        if (*ss=='/' && ++ii>2) path = ss+1;
 297      ss = (path && ss>path) ? xstrndup(path, ss-path) : 0;
 298      // TODO: handle %20 style escapes
 299    }
 300    if (!ss) ss = "index.html";
 301    if (!access((TT.O = ss), F_OK)) error_exit("%s already exists", TT.O);
 302  }
 303  // TODO: don't allow header/basename to write to stdout
 304  if (!fd) fd = xcreate(TT.O, (O_WRONLY|O_CREAT|O_TRUNC), 0644);
 305
 306  // If chunked we offset the first buffer by 2 character, meaning it is
 307  // pointing at half of the header boundary, aka '\r\n'. This simplifies
 308  // parsing of the first c_len length by allowing the do while loop to fall
 309  // through on the first iteration and parse the first c_len size.
 310  chunked = wget_find_header(toybuf, "transfer-encoding: chunked");
 311  if (chunked) memmove(toybuf, body-2, len += 2);
 312  else memmove(toybuf, body, len);
 313
 314  // len is the size remaining in toybuf
 315  // c_len is the size of the remaining bytes in the current chunk
 316  do {
 317    if (chunked) {
 318      if (c_len > 0) { // We have an incomplete c_len to write
 319        if (len <= c_len) { // Buffer is less than the c_len so full write
 320          xwrite(fd, toybuf, len);
 321          c_len = c_len - len;
 322          len = 0;
 323        } else { // Buffer is larger than the c_len so partial write
 324          xwrite(fd, toybuf, c_len);
 325          len = len - c_len;
 326          memmove(toybuf, toybuf + c_len, len);
 327          c_len = 0;
 328        }
 329      }
 330
 331      // If len is less than 2 we can't validate the chunk boundary so fall
 332      // through and go read more into toybuf.
 333      if (!c_len && (len > 2)) {
 334        char *c;
 335        if (strncmp(toybuf, "\r\n", 2) != 0) error_exit("chunk boundary");
 336
 337        // If we can't find the end of the new chunk signature fall through and
 338        // read more into toybuf.
 339        c = memmem(toybuf + 2, len - 2, "\r\n",2);
 340        if (c) {
 341          c_len = strtol(toybuf + 2, NULL, 16);
 342          if (!c_len) break; // A c_len of zero means we are complete
 343          len = len - (c - toybuf) - 2;
 344          memmove(toybuf, c + 2, len);
 345        }
 346      }
 347
 348      if (len == sizeof(toybuf)) error_exit("chunk overflow");
 349    } else {
 350      xwrite(fd, toybuf, len);
 351      len = 0;
 352    }
 353  } while ((len += wget_read(toybuf + len, sizeof(toybuf) - len)) > 0);
 354
 355  wget_close();
 356  free(TT.url);
 357}
 358