busybox/networking/wget.c
<<
>>
Prefs
   1/* vi: set sw=4 ts=4: */
   2/*
   3 * wget - retrieve a file using HTTP or FTP
   4 *
   5 * Chip Rosenthal Covad Communications <chip@laserlink.net>
   6 * Licensed under GPLv2, see file LICENSE in this source tree.
   7 *
   8 * Copyright (C) 2010 Bradley M. Kuhn <bkuhn@ebb.org>
   9 * Kuhn's copyrights are licensed GPLv2-or-later.  File as a whole remains GPLv2.
  10 */
  11
  12//usage:#define wget_trivial_usage
  13//usage:        IF_FEATURE_WGET_LONG_OPTIONS(
  14//usage:       "[-c|--continue] [-s|--spider] [-q|--quiet] [-O|--output-document FILE]\n"
  15//usage:       "        [--header 'header: value'] [-Y|--proxy on/off] [-P DIR]\n"
  16/* Since we ignore these opts, we don't show them in --help */
  17/* //usage:    "        [--no-check-certificate] [--no-cache]" */
  18//usage:       "        [-U|--user-agent AGENT]" IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
  19//usage:        )
  20//usage:        IF_NOT_FEATURE_WGET_LONG_OPTIONS(
  21//usage:       "[-csq] [-O FILE] [-Y on/off] [-P DIR] [-U AGENT]"
  22//usage:                        IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
  23//usage:        )
  24//usage:#define wget_full_usage "\n\n"
  25//usage:       "Retrieve files via HTTP or FTP\n"
  26//usage:     "\n        -s      Spider mode - only check file existence"
  27//usage:     "\n        -c      Continue retrieval of aborted transfer"
  28//usage:     "\n        -q      Quiet"
  29//usage:     "\n        -P DIR  Save to DIR (default .)"
  30//usage:        IF_FEATURE_WGET_TIMEOUT(
  31//usage:     "\n        -T SEC  Network read timeout is SEC seconds"
  32//usage:        )
  33//usage:     "\n        -O FILE Save to FILE ('-' for stdout)"
  34//usage:     "\n        -U STR  Use STR for User-Agent header"
  35//usage:     "\n        -Y      Use proxy ('on' or 'off')"
  36
  37#include "libbb.h"
  38
  39#if 0
  40# define log_io(...) bb_error_msg(__VA_ARGS__)
  41#else
  42# define log_io(...) ((void)0)
  43#endif
  44
  45
  46struct host_info {
  47        char *allocated;
  48        const char *path;
  49        char       *user;
  50        const char *protocol;
  51        char       *host;
  52        int         port;
  53};
  54static const char P_FTP[] = "ftp";
  55static const char P_HTTP[] = "http";
  56static const char P_HTTPS[] = "https";
  57
  58
  59/* Globals */
  60struct globals {
  61        off_t content_len;        /* Content-length of the file */
  62        off_t beg_range;          /* Range at which continue begins */
  63#if ENABLE_FEATURE_WGET_STATUSBAR
  64        off_t transferred;        /* Number of bytes transferred so far */
  65        const char *curfile;      /* Name of current file being transferred */
  66        bb_progress_t pmt;
  67#endif
  68        char *dir_prefix;
  69#if ENABLE_FEATURE_WGET_LONG_OPTIONS
  70        char *post_data;
  71        char *extra_headers;
  72#endif
  73        char *fname_out;        /* where to direct output (-O) */
  74        const char *proxy_flag; /* Use proxies if env vars are set */
  75        const char *user_agent; /* "User-Agent" header field */
  76#if ENABLE_FEATURE_WGET_TIMEOUT
  77        unsigned timeout_seconds;
  78        bool connecting;
  79#endif
  80        int output_fd;
  81        int o_flags;
  82        smallint chunked;         /* chunked transfer encoding */
  83        smallint got_clen;        /* got content-length: from server  */
  84        /* Local downloads do benefit from big buffer.
  85         * With 512 byte buffer, it was measured to be
  86         * an order of magnitude slower than with big one.
  87         */
  88        uint64_t just_to_align_next_member;
  89        char wget_buf[CONFIG_FEATURE_COPYBUF_KB*1024];
  90} FIX_ALIASING;
  91#define G (*ptr_to_globals)
  92#define INIT_G() do { \
  93        SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
  94} while (0)
  95#define FINI_G() do { \
  96        FREE_PTR_TO_GLOBALS(); \
  97} while (0)
  98
  99
 100/* Must match option string! */
 101enum {
 102        WGET_OPT_CONTINUE   = (1 << 0),
 103        WGET_OPT_SPIDER     = (1 << 1),
 104        WGET_OPT_QUIET      = (1 << 2),
 105        WGET_OPT_OUTNAME    = (1 << 3),
 106        WGET_OPT_PREFIX     = (1 << 4),
 107        WGET_OPT_PROXY      = (1 << 5),
 108        WGET_OPT_USER_AGENT = (1 << 6),
 109        WGET_OPT_NETWORK_READ_TIMEOUT = (1 << 7),
 110        WGET_OPT_RETRIES    = (1 << 8),
 111        WGET_OPT_PASSIVE    = (1 << 9),
 112        WGET_OPT_HEADER     = (1 << 10) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
 113        WGET_OPT_POST_DATA  = (1 << 11) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
 114};
 115
 116enum {
 117        PROGRESS_START = -1,
 118        PROGRESS_END   = 0,
 119        PROGRESS_BUMP  = 1,
 120};
 121#if ENABLE_FEATURE_WGET_STATUSBAR
 122static void progress_meter(int flag)
 123{
 124        if (option_mask32 & WGET_OPT_QUIET)
 125                return;
 126
 127        if (flag == PROGRESS_START)
 128                bb_progress_init(&G.pmt, G.curfile);
 129
 130        bb_progress_update(&G.pmt,
 131                        G.beg_range,
 132                        G.transferred,
 133                        (G.chunked || !G.got_clen) ? 0 : G.beg_range + G.transferred + G.content_len
 134        );
 135
 136        if (flag == PROGRESS_END) {
 137                bb_progress_free(&G.pmt);
 138                bb_putchar_stderr('\n');
 139                G.transferred = 0;
 140        }
 141}
 142#else
 143static ALWAYS_INLINE void progress_meter(int flag UNUSED_PARAM) { }
 144#endif
 145
 146
 147/* IPv6 knows scoped address types i.e. link and site local addresses. Link
 148 * local addresses can have a scope identifier to specify the
 149 * interface/link an address is valid on (e.g. fe80::1%eth0). This scope
 150 * identifier is only valid on a single node.
 151 *
 152 * RFC 4007 says that the scope identifier MUST NOT be sent across the wire,
 153 * unless all nodes agree on the semantic. Apache e.g. regards zone identifiers
 154 * in the Host header as invalid requests, see
 155 * https://issues.apache.org/bugzilla/show_bug.cgi?id=35122
 156 */
 157static void strip_ipv6_scope_id(char *host)
 158{
 159        char *scope, *cp;
 160
 161        /* bbox wget actually handles IPv6 addresses without [], like
 162         * wget "http://::1/xxx", but this is not standard.
 163         * To save code, _here_ we do not support it. */
 164
 165        if (host[0] != '[')
 166                return; /* not IPv6 */
 167
 168        scope = strchr(host, '%');
 169        if (!scope)
 170                return;
 171
 172        /* Remove the IPv6 zone identifier from the host address */
 173        cp = strchr(host, ']');
 174        if (!cp || (cp[1] != ':' && cp[1] != '\0')) {
 175                /* malformed address (not "[xx]:nn" or "[xx]") */
 176                return;
 177        }
 178
 179        /* cp points to "]...", scope points to "%eth0]..." */
 180        overlapping_strcpy(scope, cp);
 181}
 182
 183#if ENABLE_FEATURE_WGET_AUTHENTICATION
 184/* Base64-encode character string. */
 185static char *base64enc(const char *str)
 186{
 187        unsigned len = strlen(str);
 188        if (len > sizeof(G.wget_buf)/4*3 - 10) /* paranoia */
 189                len = sizeof(G.wget_buf)/4*3 - 10;
 190        bb_uuencode(G.wget_buf, str, len, bb_uuenc_tbl_base64);
 191        return G.wget_buf;
 192}
 193#endif
 194
 195static char* sanitize_string(char *s)
 196{
 197        unsigned char *p = (void *) s;
 198        while (*p >= ' ')
 199                p++;
 200        *p = '\0';
 201        return s;
 202}
 203
 204#if ENABLE_FEATURE_WGET_TIMEOUT
 205static void alarm_handler(int sig UNUSED_PARAM)
 206{
 207        /* This is theoretically unsafe (uses stdio and malloc in signal handler) */
 208        if (G.connecting)
 209                bb_error_msg_and_die("download timed out");
 210}
 211#endif
 212
 213static FILE *open_socket(len_and_sockaddr *lsa)
 214{
 215        int fd;
 216        FILE *fp;
 217
 218        IF_FEATURE_WGET_TIMEOUT(alarm(G.timeout_seconds); G.connecting = 1;)
 219        fd = xconnect_stream(lsa);
 220        IF_FEATURE_WGET_TIMEOUT(G.connecting = 0;)
 221
 222        /* glibc 2.4 seems to try seeking on it - ??! */
 223        /* hopefully it understands what ESPIPE means... */
 224        fp = fdopen(fd, "r+");
 225        if (!fp)
 226                bb_perror_msg_and_die(bb_msg_memory_exhausted);
 227
 228        return fp;
 229}
 230
 231/* Returns '\n' if it was seen, else '\0'. Trims at first '\r' or '\n' */
 232/* FIXME: does not respect FEATURE_WGET_TIMEOUT and -T N: */
 233static char fgets_and_trim(FILE *fp)
 234{
 235        char c;
 236        char *buf_ptr;
 237
 238        if (fgets(G.wget_buf, sizeof(G.wget_buf) - 1, fp) == NULL)
 239                bb_perror_msg_and_die("error getting response");
 240
 241        buf_ptr = strchrnul(G.wget_buf, '\n');
 242        c = *buf_ptr;
 243        *buf_ptr = '\0';
 244        buf_ptr = strchrnul(G.wget_buf, '\r');
 245        *buf_ptr = '\0';
 246
 247        log_io("< %s", G.wget_buf);
 248
 249        return c;
 250}
 251
 252static int ftpcmd(const char *s1, const char *s2, FILE *fp)
 253{
 254        int result;
 255        if (s1) {
 256                if (!s2)
 257                        s2 = "";
 258                fprintf(fp, "%s%s\r\n", s1, s2);
 259                fflush(fp);
 260                log_io("> %s%s", s1, s2);
 261        }
 262
 263        do {
 264                fgets_and_trim(fp);
 265        } while (!isdigit(G.wget_buf[0]) || G.wget_buf[3] != ' ');
 266
 267        G.wget_buf[3] = '\0';
 268        result = xatoi_positive(G.wget_buf);
 269        G.wget_buf[3] = ' ';
 270        return result;
 271}
 272
 273static void parse_url(const char *src_url, struct host_info *h)
 274{
 275        char *url, *p, *sp;
 276
 277        free(h->allocated);
 278        h->allocated = url = xstrdup(src_url);
 279
 280        h->protocol = P_FTP;
 281        p = strstr(url, "://");
 282        if (p) {
 283                *p = '\0';
 284                h->host = p + 3;
 285                if (strcmp(url, P_FTP) == 0) {
 286                        h->port = bb_lookup_port(P_FTP, "tcp", 21);
 287                } else
 288                if (strcmp(url, P_HTTPS) == 0) {
 289                        h->port = bb_lookup_port(P_HTTPS, "tcp", 443);
 290                        h->protocol = P_HTTPS;
 291                } else
 292                if (strcmp(url, P_HTTP) == 0) {
 293 http:
 294                        h->port = bb_lookup_port(P_HTTP, "tcp", 80);
 295                        h->protocol = P_HTTP;
 296                } else {
 297                        *p = ':';
 298                        bb_error_msg_and_die("not an http or ftp url: %s", sanitize_string(url));
 299                }
 300        } else {
 301                // GNU wget is user-friendly and falls back to http://
 302                h->host = url;
 303                goto http;
 304        }
 305
 306        // FYI:
 307        // "Real" wget 'http://busybox.net?var=a/b' sends this request:
 308        //   'GET /?var=a/b HTTP 1.0'
 309        //   and saves 'index.html?var=a%2Fb' (we save 'b')
 310        // wget 'http://busybox.net?login=john@doe':
 311        //   request: 'GET /?login=john@doe HTTP/1.0'
 312        //   saves: 'index.html?login=john@doe' (we save '?login=john@doe')
 313        // wget 'http://busybox.net#test/test':
 314        //   request: 'GET / HTTP/1.0'
 315        //   saves: 'index.html' (we save 'test')
 316        //
 317        // We also don't add unique .N suffix if file exists...
 318        sp = strchr(h->host, '/');
 319        p = strchr(h->host, '?'); if (!sp || (p && sp > p)) sp = p;
 320        p = strchr(h->host, '#'); if (!sp || (p && sp > p)) sp = p;
 321        if (!sp) {
 322                h->path = "";
 323        } else if (*sp == '/') {
 324                *sp = '\0';
 325                h->path = sp + 1;
 326        } else { // '#' or '?'
 327                // http://busybox.net?login=john@doe is a valid URL
 328                // memmove converts to:
 329                // http:/busybox.nett?login=john@doe...
 330                memmove(h->host - 1, h->host, sp - h->host);
 331                h->host--;
 332                sp[-1] = '\0';
 333                h->path = sp;
 334        }
 335
 336        sp = strrchr(h->host, '@');
 337        if (sp != NULL) {
 338                // URL-decode "user:password" string before base64-encoding:
 339                // wget http://test:my%20pass@example.com should send
 340                // Authorization: Basic dGVzdDpteSBwYXNz
 341                // which decodes to "test:my pass".
 342                // Standard wget and curl do this too.
 343                *sp = '\0';
 344                free(h->user);
 345                h->user = xstrdup(percent_decode_in_place(h->host, /*strict:*/ 0));
 346                h->host = sp + 1;
 347        }
 348        /* else: h->user remains NULL, or as set by original request
 349         * before redirect (if we are here after a redirect).
 350         */
 351}
 352
 353static char *gethdr(FILE *fp)
 354{
 355        char *s, *hdrval;
 356        int c;
 357
 358        /* retrieve header line */
 359        c = fgets_and_trim(fp);
 360
 361        /* end of the headers? */
 362        if (G.wget_buf[0] == '\0')
 363                return NULL;
 364
 365        /* convert the header name to lower case */
 366        for (s = G.wget_buf; isalnum(*s) || *s == '-' || *s == '.' || *s == '_'; ++s) {
 367                /*
 368                 * No-op for 20-3f and 60-7f. "0-9a-z-." are in these ranges.
 369                 * 40-5f range ("@A-Z[\]^_") maps to 60-7f.
 370                 * "A-Z" maps to "a-z".
 371                 * "@[\]" can't occur in header names.
 372                 * "^_" maps to "~,DEL" (which is wrong).
 373                 * "^" was never seen yet, "_" was seen from web.archive.org
 374                 * (x-archive-orig-x_commoncrawl_Signature: HEXSTRING).
 375                 */
 376                *s |= 0x20;
 377        }
 378
 379        /* verify we are at the end of the header name */
 380        if (*s != ':')
 381                bb_error_msg_and_die("bad header line: %s", sanitize_string(G.wget_buf));
 382
 383        /* locate the start of the header value */
 384        *s++ = '\0';
 385        hdrval = skip_whitespace(s);
 386
 387        if (c != '\n') {
 388                /* Rats! The buffer isn't big enough to hold the entire header value */
 389                while (c = getc(fp), c != EOF && c != '\n')
 390                        continue;
 391        }
 392
 393        return hdrval;
 394}
 395
 396static void reset_beg_range_to_zero(void)
 397{
 398        bb_error_msg("restart failed");
 399        G.beg_range = 0;
 400        xlseek(G.output_fd, 0, SEEK_SET);
 401        /* Done at the end instead: */
 402        /* ftruncate(G.output_fd, 0); */
 403}
 404
 405static FILE* prepare_ftp_session(FILE **dfpp, struct host_info *target, len_and_sockaddr *lsa)
 406{
 407        FILE *sfp;
 408        char *str;
 409        int port;
 410
 411        if (!target->user)
 412                target->user = xstrdup("anonymous:busybox@");
 413
 414        sfp = open_socket(lsa);
 415        if (ftpcmd(NULL, NULL, sfp) != 220)
 416                bb_error_msg_and_die("%s", sanitize_string(G.wget_buf + 4));
 417
 418        /*
 419         * Splitting username:password pair,
 420         * trying to log in
 421         */
 422        str = strchr(target->user, ':');
 423        if (str)
 424                *str++ = '\0';
 425        switch (ftpcmd("USER ", target->user, sfp)) {
 426        case 230:
 427                break;
 428        case 331:
 429                if (ftpcmd("PASS ", str, sfp) == 230)
 430                        break;
 431                /* fall through (failed login) */
 432        default:
 433                bb_error_msg_and_die("ftp login: %s", sanitize_string(G.wget_buf + 4));
 434        }
 435
 436        ftpcmd("TYPE I", NULL, sfp);
 437
 438        /*
 439         * Querying file size
 440         */
 441        if (ftpcmd("SIZE ", target->path, sfp) == 213) {
 442                G.content_len = BB_STRTOOFF(G.wget_buf + 4, NULL, 10);
 443                if (G.content_len < 0 || errno) {
 444                        bb_error_msg_and_die("SIZE value is garbage");
 445                }
 446                G.got_clen = 1;
 447        }
 448
 449        /*
 450         * Entering passive mode
 451         */
 452        if (ftpcmd("PASV", NULL, sfp) != 227) {
 453 pasv_error:
 454                bb_error_msg_and_die("bad response to %s: %s", "PASV", sanitize_string(G.wget_buf));
 455        }
 456        // Response is "227 garbageN1,N2,N3,N4,P1,P2[)garbage]
 457        // Server's IP is N1.N2.N3.N4 (we ignore it)
 458        // Server's port for data connection is P1*256+P2
 459        str = strrchr(G.wget_buf, ')');
 460        if (str) str[0] = '\0';
 461        str = strrchr(G.wget_buf, ',');
 462        if (!str) goto pasv_error;
 463        port = xatou_range(str+1, 0, 255);
 464        *str = '\0';
 465        str = strrchr(G.wget_buf, ',');
 466        if (!str) goto pasv_error;
 467        port += xatou_range(str+1, 0, 255) * 256;
 468        set_nport(&lsa->u.sa, htons(port));
 469
 470        *dfpp = open_socket(lsa);
 471
 472        if (G.beg_range != 0) {
 473                sprintf(G.wget_buf, "REST %"OFF_FMT"u", G.beg_range);
 474                if (ftpcmd(G.wget_buf, NULL, sfp) == 350)
 475                        G.content_len -= G.beg_range;
 476                else
 477                        reset_beg_range_to_zero();
 478        }
 479
 480        if (ftpcmd("RETR ", target->path, sfp) > 150)
 481                bb_error_msg_and_die("bad response to %s: %s", "RETR", sanitize_string(G.wget_buf));
 482
 483        return sfp;
 484}
 485
 486static int spawn_https_helper(const char *host, unsigned port)
 487{
 488        char *allocated = NULL;
 489        int sp[2];
 490        int pid;
 491
 492        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sp) != 0)
 493                /* Kernel can have AF_UNIX support disabled */
 494                bb_perror_msg_and_die("socketpair");
 495
 496        if (!strchr(host, ':'))
 497                host = allocated = xasprintf("%s:%u", host, port);
 498
 499        pid = BB_MMU ? xfork() : xvfork();
 500        if (pid == 0) {
 501                /* Child */
 502                char *argv[6];
 503
 504                close(sp[0]);
 505                xmove_fd(sp[1], 0);
 506                xdup2(0, 1);
 507                /*
 508                 * TODO: develop a tiny ssl/tls helper (using matrixssl?),
 509                 * try to exec it here before falling back to big fat openssl.
 510                 */
 511                /*
 512                 * openssl s_client -quiet -connect www.kernel.org:443 2>/dev/null
 513                 * It prints some debug stuff on stderr, don't know how to suppress it.
 514                 * Work around by dev-nulling stderr. We lose all error messages :(
 515                 */
 516                xmove_fd(2, 3);
 517                xopen("/dev/null", O_RDWR);
 518                argv[0] = (char*)"openssl";
 519                argv[1] = (char*)"s_client";
 520                argv[2] = (char*)"-quiet";
 521                argv[3] = (char*)"-connect";
 522                argv[4] = (char*)host;
 523                argv[5] = NULL;
 524                BB_EXECVP(argv[0], argv);
 525                xmove_fd(3, 2);
 526                bb_perror_msg_and_die("can't execute '%s'", argv[0]);
 527                /* notreached */
 528        }
 529
 530        /* Parent */
 531        free(allocated);
 532        close(sp[1]);
 533        return sp[0];
 534}
 535
 536/* See networking/ssl_helper/README */
 537#define SSL_HELPER 0
 538
 539#if SSL_HELPER
 540static void spawn_https_helper1(int network_fd)
 541{
 542        int sp[2];
 543        int pid;
 544
 545        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sp) != 0)
 546                /* Kernel can have AF_UNIX support disabled */
 547                bb_perror_msg_and_die("socketpair");
 548
 549        pid = BB_MMU ? xfork() : xvfork();
 550        if (pid == 0) {
 551                /* Child */
 552                char *argv[3];
 553
 554                close(sp[0]);
 555                xmove_fd(sp[1], 0);
 556                xdup2(0, 1);
 557                xmove_fd(network_fd, 3);
 558                /*
 559                 * A simple ssl/tls helper
 560                 */
 561                argv[0] = (char*)"ssl_helper";
 562                argv[1] = (char*)"-d3";
 563                argv[2] = NULL;
 564                BB_EXECVP(argv[0], argv);
 565                bb_perror_msg_and_die("can't execute '%s'", argv[0]);
 566                /* notreached */
 567        }
 568
 569        /* Parent */
 570        close(sp[1]);
 571        xmove_fd(sp[0], network_fd);
 572}
 573#endif
 574
 575static void NOINLINE retrieve_file_data(FILE *dfp)
 576{
 577#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
 578# if ENABLE_FEATURE_WGET_TIMEOUT
 579        unsigned second_cnt = G.timeout_seconds;
 580# endif
 581        struct pollfd polldata;
 582
 583        polldata.fd = fileno(dfp);
 584        polldata.events = POLLIN | POLLPRI;
 585#endif
 586        progress_meter(PROGRESS_START);
 587
 588        if (G.chunked)
 589                goto get_clen;
 590
 591        /* Loops only if chunked */
 592        while (1) {
 593
 594#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
 595                /* Must use nonblocking I/O, otherwise fread will loop
 596                 * and *block* until it reads full buffer,
 597                 * which messes up progress bar and/or timeout logic.
 598                 * Because of nonblocking I/O, we need to dance
 599                 * very carefully around EAGAIN. See explanation at
 600                 * clearerr() calls.
 601                 */
 602                ndelay_on(polldata.fd);
 603#endif
 604                while (1) {
 605                        int n;
 606                        unsigned rdsz;
 607
 608#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
 609                        /* fread internally uses read loop, which in our case
 610                         * is usually exited when we get EAGAIN.
 611                         * In this case, libc sets error marker on the stream.
 612                         * Need to clear it before next fread to avoid possible
 613                         * rare false positive ferror below. Rare because usually
 614                         * fread gets more than zero bytes, and we don't fall
 615                         * into if (n <= 0) ...
 616                         */
 617                        clearerr(dfp);
 618#endif
 619                        errno = 0;
 620                        rdsz = sizeof(G.wget_buf);
 621                        if (G.got_clen) {
 622                                if (G.content_len < (off_t)sizeof(G.wget_buf)) {
 623                                        if ((int)G.content_len <= 0)
 624                                                break;
 625                                        rdsz = (unsigned)G.content_len;
 626                                }
 627                        }
 628                        n = fread(G.wget_buf, 1, rdsz, dfp);
 629
 630                        if (n > 0) {
 631                                xwrite(G.output_fd, G.wget_buf, n);
 632#if ENABLE_FEATURE_WGET_STATUSBAR
 633                                G.transferred += n;
 634#endif
 635                                if (G.got_clen) {
 636                                        G.content_len -= n;
 637                                        if (G.content_len == 0)
 638                                                break;
 639                                }
 640#if ENABLE_FEATURE_WGET_TIMEOUT
 641                                second_cnt = G.timeout_seconds;
 642#endif
 643                                goto bump;
 644                        }
 645
 646                        /* n <= 0.
 647                         * man fread:
 648                         * If error occurs, or EOF is reached, the return value
 649                         * is a short item count (or zero).
 650                         * fread does not distinguish between EOF and error.
 651                         */
 652                        if (errno != EAGAIN) {
 653                                if (ferror(dfp)) {
 654                                        progress_meter(PROGRESS_END);
 655                                        bb_perror_msg_and_die(bb_msg_read_error);
 656                                }
 657                                break; /* EOF, not error */
 658                        }
 659
 660#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
 661                        /* It was EAGAIN. There is no data. Wait up to one second
 662                         * then abort if timed out, or update the bar and try reading again.
 663                         */
 664                        if (safe_poll(&polldata, 1, 1000) == 0) {
 665# if ENABLE_FEATURE_WGET_TIMEOUT
 666                                if (second_cnt != 0 && --second_cnt == 0) {
 667                                        progress_meter(PROGRESS_END);
 668                                        bb_error_msg_and_die("download timed out");
 669                                }
 670# endif
 671                                /* We used to loop back to poll here,
 672                                 * but there is no great harm in letting fread
 673                                 * to try reading anyway.
 674                                 */
 675                        }
 676#endif
 677 bump:
 678                        /* Need to do it _every_ second for "stalled" indicator
 679                         * to be shown properly.
 680                         */
 681                        progress_meter(PROGRESS_BUMP);
 682                } /* while (reading data) */
 683
 684#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
 685                clearerr(dfp);
 686                ndelay_off(polldata.fd); /* else fgets can get very unhappy */
 687#endif
 688                if (!G.chunked)
 689                        break;
 690
 691                fgets_and_trim(dfp); /* Eat empty line */
 692 get_clen:
 693                fgets_and_trim(dfp);
 694                G.content_len = STRTOOFF(G.wget_buf, NULL, 16);
 695                /* FIXME: error check? */
 696                if (G.content_len == 0)
 697                        break; /* all done! */
 698                G.got_clen = 1;
 699                /*
 700                 * Note that fgets may result in some data being buffered in dfp.
 701                 * We loop back to fread, which will retrieve this data.
 702                 * Also note that code has to be arranged so that fread
 703                 * is done _before_ one-second poll wait - poll doesn't know
 704                 * about stdio buffering and can result in spurious one second waits!
 705                 */
 706        }
 707
 708        /* If -c failed, we restart from the beginning,
 709         * but we do not truncate file then, we do it only now, at the end.
 710         * This lets user to ^C if his 99% complete 10 GB file download
 711         * failed to restart *without* losing the almost complete file.
 712         */
 713        {
 714                off_t pos = lseek(G.output_fd, 0, SEEK_CUR);
 715                if (pos != (off_t)-1)
 716                        ftruncate(G.output_fd, pos);
 717        }
 718
 719        /* Draw full bar and free its resources */
 720        G.chunked = 0;  /* makes it show 100% even for chunked download */
 721        G.got_clen = 1; /* makes it show 100% even for download of (formerly) unknown size */
 722        progress_meter(PROGRESS_END);
 723}
 724
 725static void download_one_url(const char *url)
 726{
 727        bool use_proxy;                 /* Use proxies if env vars are set  */
 728        int redir_limit;
 729        len_and_sockaddr *lsa;
 730        FILE *sfp;                      /* socket to web/ftp server         */
 731        FILE *dfp;                      /* socket to ftp server (data)      */
 732        char *proxy = NULL;
 733        char *fname_out_alloc;
 734        char *redirected_path = NULL;
 735        struct host_info server;
 736        struct host_info target;
 737
 738        server.allocated = NULL;
 739        target.allocated = NULL;
 740        server.user = NULL;
 741        target.user = NULL;
 742
 743        parse_url(url, &target);
 744
 745        /* Use the proxy if necessary */
 746        use_proxy = (strcmp(G.proxy_flag, "off") != 0);
 747        if (use_proxy) {
 748                proxy = getenv(target.protocol == P_FTP ? "ftp_proxy" : "http_proxy");
 749//FIXME: what if protocol is https? Ok to use http_proxy?
 750                use_proxy = (proxy && proxy[0]);
 751                if (use_proxy)
 752                        parse_url(proxy, &server);
 753        }
 754        if (!use_proxy) {
 755                server.port = target.port;
 756                if (ENABLE_FEATURE_IPV6) {
 757                        //free(server.allocated); - can't be non-NULL
 758                        server.host = server.allocated = xstrdup(target.host);
 759                } else {
 760                        server.host = target.host;
 761                }
 762        }
 763
 764        if (ENABLE_FEATURE_IPV6)
 765                strip_ipv6_scope_id(target.host);
 766
 767        /* If there was no -O FILE, guess output filename */
 768        fname_out_alloc = NULL;
 769        if (!(option_mask32 & WGET_OPT_OUTNAME)) {
 770                G.fname_out = bb_get_last_path_component_nostrip(target.path);
 771                /* handle "wget http://kernel.org//" */
 772                if (G.fname_out[0] == '/' || !G.fname_out[0])
 773                        G.fname_out = (char*)"index.html";
 774                /* -P DIR is considered only if there was no -O FILE */
 775                if (G.dir_prefix)
 776                        G.fname_out = fname_out_alloc = concat_path_file(G.dir_prefix, G.fname_out);
 777                else {
 778                        /* redirects may free target.path later, need to make a copy */
 779                        G.fname_out = fname_out_alloc = xstrdup(G.fname_out);
 780                }
 781        }
 782#if ENABLE_FEATURE_WGET_STATUSBAR
 783        G.curfile = bb_get_last_path_component_nostrip(G.fname_out);
 784#endif
 785
 786        /* Determine where to start transfer */
 787        G.beg_range = 0;
 788        if (option_mask32 & WGET_OPT_CONTINUE) {
 789                G.output_fd = open(G.fname_out, O_WRONLY);
 790                if (G.output_fd >= 0) {
 791                        G.beg_range = xlseek(G.output_fd, 0, SEEK_END);
 792                }
 793                /* File doesn't exist. We do not create file here yet.
 794                 * We are not sure it exists on remote side */
 795        }
 796
 797        redir_limit = 5;
 798 resolve_lsa:
 799        lsa = xhost2sockaddr(server.host, server.port);
 800        if (!(option_mask32 & WGET_OPT_QUIET)) {
 801                char *s = xmalloc_sockaddr2dotted(&lsa->u.sa);
 802                fprintf(stderr, "Connecting to %s (%s)\n", server.host, s);
 803                free(s);
 804        }
 805 establish_session:
 806        /*G.content_len = 0; - redundant, got_clen = 0 is enough */
 807        G.got_clen = 0;
 808        G.chunked = 0;
 809        if (use_proxy || target.protocol != P_FTP) {
 810                /*
 811                 *  HTTP session
 812                 */
 813                char *str;
 814                int status;
 815
 816                /* Open socket to http(s) server */
 817                if (target.protocol == P_HTTPS) {
 818/* openssl-based helper
 819 * Inconvenient API since we can't give it an open fd
 820 */
 821                        int fd = spawn_https_helper(server.host, server.port);
 822                        sfp = fdopen(fd, "r+");
 823                        if (!sfp)
 824                                bb_perror_msg_and_die(bb_msg_memory_exhausted);
 825                } else
 826                        sfp = open_socket(lsa);
 827#if SSL_HELPER
 828                if (target.protocol == P_HTTPS)
 829                        spawn_https_helper1(fileno(sfp));
 830#endif
 831                /* Send HTTP request */
 832                if (use_proxy) {
 833                        fprintf(sfp, "GET %s://%s/%s HTTP/1.1\r\n",
 834                                target.protocol, target.host,
 835                                target.path);
 836                } else {
 837                        fprintf(sfp, "%s /%s HTTP/1.1\r\n",
 838                                (option_mask32 & WGET_OPT_POST_DATA) ? "POST" : "GET",
 839                                target.path);
 840                }
 841
 842                fprintf(sfp, "Host: %s\r\nUser-Agent: %s\r\n",
 843                        target.host, G.user_agent);
 844
 845                /* Ask server to close the connection as soon as we are done
 846                 * (IOW: we do not intend to send more requests)
 847                 */
 848                fprintf(sfp, "Connection: close\r\n");
 849
 850#if ENABLE_FEATURE_WGET_AUTHENTICATION
 851                if (target.user) {
 852                        fprintf(sfp, "Proxy-Authorization: Basic %s\r\n"+6,
 853                                base64enc(target.user));
 854                }
 855                if (use_proxy && server.user) {
 856                        fprintf(sfp, "Proxy-Authorization: Basic %s\r\n",
 857                                base64enc(server.user));
 858                }
 859#endif
 860
 861                if (G.beg_range != 0)
 862                        fprintf(sfp, "Range: bytes=%"OFF_FMT"u-\r\n", G.beg_range);
 863
 864#if ENABLE_FEATURE_WGET_LONG_OPTIONS
 865                if (G.extra_headers)
 866                        fputs(G.extra_headers, sfp);
 867
 868                if (option_mask32 & WGET_OPT_POST_DATA) {
 869                        fprintf(sfp,
 870                                "Content-Type: application/x-www-form-urlencoded\r\n"
 871                                "Content-Length: %u\r\n"
 872                                "\r\n"
 873                                "%s",
 874                                (int) strlen(G.post_data), G.post_data
 875                        );
 876                } else
 877#endif
 878                {
 879                        fprintf(sfp, "\r\n");
 880                }
 881
 882                fflush(sfp);
 883
 884                /*
 885                 * Retrieve HTTP response line and check for "200" status code.
 886                 */
 887 read_response:
 888                fgets_and_trim(sfp);
 889
 890                str = G.wget_buf;
 891                str = skip_non_whitespace(str);
 892                str = skip_whitespace(str);
 893                // FIXME: no error check
 894                // xatou wouldn't work: "200 OK"
 895                status = atoi(str);
 896                switch (status) {
 897                case 0:
 898                case 100:
 899                        while (gethdr(sfp) != NULL)
 900                                /* eat all remaining headers */;
 901                        goto read_response;
 902                case 200:
 903/*
 904Response 204 doesn't say "null file", it says "metadata
 905has changed but data didn't":
 906
 907"10.2.5 204 No Content
 908The server has fulfilled the request but does not need to return
 909an entity-body, and might want to return updated metainformation.
 910The response MAY include new or updated metainformation in the form
 911of entity-headers, which if present SHOULD be associated with
 912the requested variant.
 913
 914If the client is a user agent, it SHOULD NOT change its document
 915view from that which caused the request to be sent. This response
 916is primarily intended to allow input for actions to take place
 917without causing a change to the user agent's active document view,
 918although any new or updated metainformation SHOULD be applied
 919to the document currently in the user agent's active view.
 920
 921The 204 response MUST NOT include a message-body, and thus
 922is always terminated by the first empty line after the header fields."
 923
 924However, in real world it was observed that some web servers
 925(e.g. Boa/0.94.14rc21) simply use code 204 when file size is zero.
 926*/
 927                case 204:
 928                        if (G.beg_range != 0) {
 929                                /* "Range:..." was not honored by the server.
 930                                 * Restart download from the beginning.
 931                                 */
 932                                reset_beg_range_to_zero();
 933                        }
 934                        break;
 935                case 300:  /* redirection */
 936                case 301:
 937                case 302:
 938                case 303:
 939                        break;
 940                case 206: /* Partial Content */
 941                        if (G.beg_range != 0)
 942                                /* "Range:..." worked. Good. */
 943                                break;
 944                        /* Partial Content even though we did not ask for it??? */
 945                        /* fall through */
 946                default:
 947                        bb_error_msg_and_die("server returned error: %s", sanitize_string(G.wget_buf));
 948                }
 949
 950                /*
 951                 * Retrieve HTTP headers.
 952                 */
 953                while ((str = gethdr(sfp)) != NULL) {
 954                        static const char keywords[] ALIGN1 =
 955                                "content-length\0""transfer-encoding\0""location\0";
 956                        enum {
 957                                KEY_content_length = 1, KEY_transfer_encoding, KEY_location
 958                        };
 959                        smalluint key;
 960
 961                        /* gethdr converted "FOO:" string to lowercase */
 962
 963                        /* strip trailing whitespace */
 964                        char *s = strchrnul(str, '\0') - 1;
 965                        while (s >= str && (*s == ' ' || *s == '\t')) {
 966                                *s = '\0';
 967                                s--;
 968                        }
 969                        key = index_in_strings(keywords, G.wget_buf) + 1;
 970                        if (key == KEY_content_length) {
 971                                G.content_len = BB_STRTOOFF(str, NULL, 10);
 972                                if (G.content_len < 0 || errno) {
 973                                        bb_error_msg_and_die("content-length %s is garbage", sanitize_string(str));
 974                                }
 975                                G.got_clen = 1;
 976                                continue;
 977                        }
 978                        if (key == KEY_transfer_encoding) {
 979                                if (strcmp(str_tolower(str), "chunked") != 0)
 980                                        bb_error_msg_and_die("transfer encoding '%s' is not supported", sanitize_string(str));
 981                                G.chunked = 1;
 982                        }
 983                        if (key == KEY_location && status >= 300) {
 984                                if (--redir_limit == 0)
 985                                        bb_error_msg_and_die("too many redirections");
 986                                fclose(sfp);
 987                                if (str[0] == '/') {
 988                                        free(redirected_path);
 989                                        target.path = redirected_path = xstrdup(str+1);
 990                                        /* lsa stays the same: it's on the same server */
 991                                } else {
 992                                        parse_url(str, &target);
 993                                        if (!use_proxy) {
 994                                                /* server.user remains untouched */
 995                                                free(server.allocated);
 996                                                server.allocated = NULL;
 997                                                server.host = target.host;
 998                                                /* strip_ipv6_scope_id(target.host); - no! */
 999                                                /* we assume remote never gives us IPv6 addr with scope id */
1000                                                server.port = target.port;
1001                                                free(lsa);
1002                                                goto resolve_lsa;
1003                                        } /* else: lsa stays the same: we use proxy */
1004                                }
1005                                goto establish_session;
1006                        }
1007                }
1008//              if (status >= 300)
1009//                      bb_error_msg_and_die("bad redirection (no Location: header from server)");
1010
1011                /* For HTTP, data is pumped over the same connection */
1012                dfp = sfp;
1013
1014        } else {
1015                /*
1016                 *  FTP session
1017                 */
1018                sfp = prepare_ftp_session(&dfp, &target, lsa);
1019        }
1020
1021        free(lsa);
1022
1023        if (!(option_mask32 & WGET_OPT_SPIDER)) {
1024                if (G.output_fd < 0)
1025                        G.output_fd = xopen(G.fname_out, G.o_flags);
1026                retrieve_file_data(dfp);
1027                if (!(option_mask32 & WGET_OPT_OUTNAME)) {
1028                        xclose(G.output_fd);
1029                        G.output_fd = -1;
1030                }
1031        }
1032
1033        if (dfp != sfp) {
1034                /* It's ftp. Close data connection properly */
1035                fclose(dfp);
1036                if (ftpcmd(NULL, NULL, sfp) != 226)
1037                        bb_error_msg_and_die("ftp error: %s", sanitize_string(G.wget_buf + 4));
1038                /* ftpcmd("QUIT", NULL, sfp); - why bother? */
1039        }
1040        fclose(sfp);
1041
1042        free(server.allocated);
1043        free(target.allocated);
1044        free(server.user);
1045        free(target.user);
1046        free(fname_out_alloc);
1047        free(redirected_path);
1048}
1049
1050int wget_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
1051int wget_main(int argc UNUSED_PARAM, char **argv)
1052{
1053#if ENABLE_FEATURE_WGET_LONG_OPTIONS
1054        static const char wget_longopts[] ALIGN1 =
1055                /* name, has_arg, val */
1056                "continue\0"         No_argument       "c"
1057//FIXME: -s isn't --spider, it's --save-headers!
1058                "spider\0"           No_argument       "s"
1059                "quiet\0"            No_argument       "q"
1060                "output-document\0"  Required_argument "O"
1061                "directory-prefix\0" Required_argument "P"
1062                "proxy\0"            Required_argument "Y"
1063                "user-agent\0"       Required_argument "U"
1064#if ENABLE_FEATURE_WGET_TIMEOUT
1065                "timeout\0"          Required_argument "T"
1066#endif
1067                /* Ignored: */
1068                // "tries\0"            Required_argument "t"
1069                /* Ignored (we always use PASV): */
1070                "passive-ftp\0"      No_argument       "\xff"
1071                "header\0"           Required_argument "\xfe"
1072                "post-data\0"        Required_argument "\xfd"
1073                /* Ignored (we don't do ssl) */
1074                "no-check-certificate\0" No_argument   "\xfc"
1075                /* Ignored (we don't support caching) */
1076                "no-cache\0"         No_argument       "\xfb"
1077                ;
1078#endif
1079
1080#if ENABLE_FEATURE_WGET_LONG_OPTIONS
1081        llist_t *headers_llist = NULL;
1082#endif
1083
1084        INIT_G();
1085
1086#if ENABLE_FEATURE_WGET_TIMEOUT
1087        G.timeout_seconds = 900;
1088        signal(SIGALRM, alarm_handler);
1089#endif
1090        G.proxy_flag = "on";   /* use proxies if env vars are set */
1091        G.user_agent = "Wget"; /* "User-Agent" header field */
1092
1093#if ENABLE_FEATURE_WGET_LONG_OPTIONS
1094        applet_long_options = wget_longopts;
1095#endif
1096        opt_complementary = "-1" IF_FEATURE_WGET_TIMEOUT(":T+") IF_FEATURE_WGET_LONG_OPTIONS(":\xfe::");
1097        getopt32(argv, "csqO:P:Y:U:T:" /*ignored:*/ "t:",
1098                &G.fname_out, &G.dir_prefix,
1099                &G.proxy_flag, &G.user_agent,
1100                IF_FEATURE_WGET_TIMEOUT(&G.timeout_seconds) IF_NOT_FEATURE_WGET_TIMEOUT(NULL),
1101                NULL /* -t RETRIES */
1102                IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
1103                IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
1104        );
1105        argv += optind;
1106
1107#if ENABLE_FEATURE_WGET_LONG_OPTIONS
1108        if (headers_llist) {
1109                int size = 1;
1110                char *cp;
1111                llist_t *ll = headers_llist;
1112                while (ll) {
1113                        size += strlen(ll->data) + 2;
1114                        ll = ll->link;
1115                }
1116                G.extra_headers = cp = xmalloc(size);
1117                while (headers_llist) {
1118                        cp += sprintf(cp, "%s\r\n", (char*)llist_pop(&headers_llist));
1119                }
1120        }
1121#endif
1122
1123        G.output_fd = -1;
1124        G.o_flags = O_WRONLY | O_CREAT | O_TRUNC | O_EXCL;
1125        if (G.fname_out) { /* -O FILE ? */
1126                if (LONE_DASH(G.fname_out)) { /* -O - ? */
1127                        G.output_fd = 1;
1128                        option_mask32 &= ~WGET_OPT_CONTINUE;
1129                }
1130                /* compat with wget: -O FILE can overwrite */
1131                G.o_flags = O_WRONLY | O_CREAT | O_TRUNC;
1132        }
1133
1134        while (*argv)
1135                download_one_url(*argv++);
1136
1137        if (G.output_fd >= 0)
1138                xclose(G.output_fd);
1139
1140#if ENABLE_FEATURE_CLEAN_UP && ENABLE_FEATURE_WGET_LONG_OPTIONS
1141        free(G.extra_headers);
1142#endif
1143        FINI_G();
1144
1145        return EXIT_SUCCESS;
1146}
1147