ExtensionCrawler/crawler

#!/usr/bin/env python3.6
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""
A crawler for extensions from the Chrome Web Store.
"""
import os
import sys
import datetime
import time
import getopt
import logging
import itertools
from functools import reduce
from ExtensionCrawler.discover import get_new_ids
from ExtensionCrawler.archive import get_forum_ext_ids, get_existing_ids, update_extensions
from ExtensionCrawler.config import *
from ExtensionCrawler.util import log_info, log_exception


def write_log(dirname, fname, text):
    """Write text into the file with name fname in directory dirname."""
    os.makedirs(dirname, exist_ok=True)
    with open(os.path.join(dirname, fname), 'w') as logfile:
        logfile.write(text)


def log_failures_to_file(dirname, today, res):
    """Log failures during download/update in the log directory dirname."""

    def write_id_log(suffix, ids):
        """Write one extension id per line (sorted) to today's log file."""
        write_log(dirname, today + suffix, "\n".join(sorted(ids)))

    write_id_log("-not-authorized.log",
                 [x.id for x in res if x.not_authorized()])
    write_id_log("-updated.log",
                 [x.id for x in res if x.is_ok() and not x.not_modified()])
    write_id_log("-raised-exception.log",
                 [x.id for x in res if x.has_exception()])
    write_id_log("-raised-ddos.log",
                 [x.id for x in res if x.raised_google_ddos()])
    write_id_log("-not-in-store.log",
                 [x.id for x in res if x.not_in_store()])
    write_id_log("-new-in-store.log",
                 [x.id for x in res if x.is_new()])
    write_id_log("-file-corruption.log",
                 [x.id for x in res if x.corrupt_tar()])
    write_id_log("-sql-exception.log",
                 [x.id for x in res if x.sql_exception()])
    write_id_log("-sql-not-updated.log",
                 [x.id for x in res if not x.sql_success()])


def log_summary(res, runtime=0):
    """Log brief result summary."""
    corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))
    log_info("Summary:")
    log_info(" Updated {} out of {} extensions successfully".format(
        len(list(filter(lambda x: x.is_ok(), res))), len(res)))
    log_info(" Updated extensions: {:8d}".format(
        len(list(filter(lambda x: x.is_ok() and not x.not_modified(), res)))))
    log_info(" Updated SQL databases: {:8d}".format(
        len(list(filter(lambda x: x.sql_success(), res)))))
    log_info(" New extensions: {:8d}".format(
        len(list(filter(lambda x: x.is_new(), res)))))
    log_info(" Not authorized: {:8d}".format(
        len(list(filter(lambda x: x.not_authorized(), res)))))
    log_info(" Raised Google DDOS: {:8d}".format(
        len(list(filter(lambda x: x.raised_google_ddos(), res)))))
    log_info(" Not modified archives: {:8d}".format(
        len(list(filter(lambda x: x.not_modified(), res)))))
    log_info(" Extensions not in store: {:8d}".format(
        len(list(filter(lambda x: x.not_in_store(), res)))))
    log_info(" Unknown exception: {:8d}".format(
        len(list(filter(lambda x: x.has_exception(), res)))))
    log_info(" Corrupt tar archives: {:8d}".format(
        len(corrupt_tar_archives)))
    log_info(" SQL exception: {:8d}".format(
        len(list(filter(lambda x: x.sql_exception(), res)))))
    log_info(" Total runtime: {}".format(
        datetime.timedelta(seconds=int(runtime))))
    if corrupt_tar_archives:
        log_info("")
        log_info("List of extensions with corrupted files/archives:")
        for result in corrupt_tar_archives:
            log_info(" " + result.id + ": " + str(result.exception))
        log_info("")


def helpmsg():
    """Print help message."""
    print("crawler [OPTION]")
    print("    -h                  print this help text")
    print("    -s                  silent (no log messages)")
    print("    -d                  discover new extensions")
    print("    -p <N>              number of concurrent downloads")
    print("    -P                  use ProcessPool (default: Pool) for concurrency")
    print("    -F                  do not download extensions with forums (skip sequential download)")
    print("    -N                  do not download extensions without forums (skip concurrent download)")
    print("    -a <DIR>            archive directory")
    print("    -t <N>              timeout for an individual extension download")
    print("    --max-discover <N>  discover at most N new extensions")


def print_config(basedir, archive_dir, conf_dir, discover, parallel,
                 download_ext_ids_with_forums, download_ext_ids_without_forums,
                 ext_timeout, use_process_pool):
    """Print current configuration."""
    # Note: the parameter order now matches the call site in main(); the two
    # download_ext_ids_* parameters were previously swapped, which mislabeled
    # the printed values.
    log_info("Configuration:")
    log_info(" Base dir: {}".format(basedir))
    log_info(" Archive directory: {}".format(archive_dir))
    log_info(" Configuration directory: {}".format(conf_dir))
    log_info(" Discover new extensions: {}".format(discover))
    log_info(" Download ext. without forums: {}".format(
        download_ext_ids_without_forums))
    log_info(" Download ext. with forums: {}".format(
        download_ext_ids_with_forums))
    log_info(" Max num. of concurrent downloads: {}".format(parallel))
    log_info(" Use ProcessPool: {}".format(use_process_pool))
    log_info(" Download timeout: {}".format(ext_timeout))


def parse_args(argv):
    """Parse command line arguments."""
    basedir = const_basedir()
    parallel = const_parallel_downloads()
    verbose = const_verbose()
    discover = const_discover()
    use_process_pool = const_use_process_pool()
    download_ext_ids_with_forums = const_download_ext_ids_with_forums()
    download_ext_ids_without_forums = const_download_ext_ids_without_forums()
    ext_timeout = const_ext_timeout()
    max_discover = None
    try:
        opts, _ = getopt.getopt(
            argv, "hsdFNPa:p:t:",
            ["timeout=", "archive=", "parallel=", "max-discover="])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-p", "--parallel"):
            parallel = int(arg)
        elif opt in ("-t", "--timeout"):
            ext_timeout = int(arg)
        elif opt == '-s':
            verbose = False
        elif opt == '-P':
            use_process_pool = True
        elif opt == '-d':
            discover = True
        elif opt == '-F':
            download_ext_ids_with_forums = False
        elif opt == '-N':
            download_ext_ids_without_forums = False
        elif opt == '--max-discover':
            discover = True
            max_discover = int(arg)
    return (basedir, parallel, verbose, discover, max_discover,
            download_ext_ids_with_forums, download_ext_ids_without_forums,
            ext_timeout, use_process_pool)


def main(argv):
    """Main function of the extension crawler."""
    today = datetime.datetime.now(datetime.timezone.utc).isoformat()
    (basedir, parallel, verbose, discover, max_discover,
     download_ext_ids_with_forums, download_ext_ids_without_forums,
     ext_timeout, use_process_pool) = parse_args(argv)
    if verbose:
        loglevel = logging.INFO
    else:
        loglevel = logging.WARNING
    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(loglevel)
    # Suppress the "Starting HTTPS connection ..." log messages.
    # Older versions of requests log them at level INFO, newer ones at DEBUG.
    logging.getLogger("requests").setLevel(logging.WARNING)
    archive_dir = os.path.join(basedir, "data")
    os.makedirs(archive_dir, exist_ok=True)
    conf_dir = os.path.join(basedir, "conf")
    os.makedirs(conf_dir, exist_ok=True)
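    # Make sure the forum-extension list exists, so that get_forum_ext_ids
    # below also works on a fresh archive directory.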
    open(os.path.join(conf_dir, "forums.conf"), 'a').close()
    log_dir = os.path.join(basedir, "log")
    os.makedirs(log_dir, exist_ok=True)
    start_time = time.time()
    print_config(basedir, archive_dir, conf_dir, discover, parallel,
                 download_ext_ids_with_forums, download_ext_ids_without_forums,
                 ext_timeout, use_process_pool)
    forum_ext_ids = get_forum_ext_ids(conf_dir)
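    # The set of known ids is everything already present in the archive plus
    # the extensions listed in forums.conf.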
    known_ids = list(set(get_existing_ids(archive_dir)) | set(forum_ext_ids))
    discovered_ids = []
    if discover:
        log_info("Discovering new ids {}...".format("(at most {}) ".format(
            max_discover) if max_discover is not None else ""))
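        # itertools.islice with a stop of None yields all elements, so
        # max_discover=None means unlimited discovery.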
        try:
            discovered_ids = list(
                itertools.islice(get_new_ids(known_ids), max_discover))
        except Exception:
            log_exception("Exception when discovering new ids")
        log_info("Discovered {} new extensions".format(len(discovered_ids)), 1)
    ext_ids = list(set(discovered_ids) | set(known_ids))
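    # Drop the references to the (potentially large) id lists; only the
    # merged ext_ids is needed from here on.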
    discovered_ids = None
    known_ids = None
    # Honor -F/-N: -F skips extensions with forums (the sequential download),
    # -N skips extensions without forums (the concurrent download).
    if download_ext_ids_with_forums:
        if not download_ext_ids_without_forums:
            # download only extensions with forums
            ext_ids = []
    else:
        if download_ext_ids_without_forums:
            # download only extensions without forums
            forum_ext_ids = []
        else:
            # download neither type of extension
            ext_ids = []
            forum_ext_ids = []
    res = update_extensions(archive_dir, parallel, forum_ext_ids, ext_ids,
                            ext_timeout, use_process_pool)
    # Retry (once) the extensions that raised unknown exceptions, as these
    # failures are often temporary.
    has_exception = list(filter(lambda x: x.has_exception(), res))
    if has_exception:
        log_info(
            " {} extensions with unknown exceptions, starting a second attempt ...".
            format(len(has_exception)))
        has_exception_ids = list(map(lambda x: x.id, has_exception))
        forum_ext_ids_except = list(
            set(forum_ext_ids).intersection(set(has_exception_ids)))
        ext_ids_except = sorted(
            list(set(has_exception_ids) - set(forum_ext_ids_except)))
        res_update = update_extensions(archive_dir, parallel,
                                       forum_ext_ids_except, ext_ids_except,
                                       ext_timeout, use_process_pool)
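        # Replace the results of the failed first attempt with the results
        # of the second one.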
        res = list(set(res) - set(has_exception)) + res_update
    end_time = time.time()
    log_summary(res, end_time - start_time)
    log_failures_to_file(log_dir, today, res)
if __name__ == "__main__":
main(sys.argv[1:])