diff --git a/crawler b/crawler index ce50003..2d0c953 100755 --- a/crawler +++ b/crawler @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # # Copyright (C) 2016,2017 The University of Sheffield, UK -# +# # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or @@ -18,83 +18,78 @@ import os import sys -import glob -import re -import requests -from time import sleep -from random import randint +import sqlite3 import datetime -from ExtensionCrawler.discover import * -from ExtensionCrawler.archive import * -from ExtensionCrawler.util import * -from ExtensionCrawler.discover import * -import dateutil -import dateutil.parser import time import getopt -import sqlite3 - +from functools import reduce +import dateutil +import dateutil.parser +from ExtensionCrawler.discover import get_new_ids +from ExtensionCrawler.archive import * +from ExtensionCrawler.util import * # Script should run with python 3.4 or 3.5 assert sys.version_info >= (3, 4) and sys.version_info < (3, 6) -def write_log(dir, fname, text): - os.makedirs(dir, exist_ok=True) - with open(os.path.join(dir, fname), 'w') as f: +def write_log(dirname, fname, text): + os.makedirs(dirname, exist_ok=True) + with open(os.path.join(dirname, fname), 'w') as f: f.write(text) -def log_failures_to_file(dir, today, res): +def log_failures_to_file(dirname, today, res): not_authorized = reduce( lambda x, y: x + "\n" + y, sorted(map(lambda x: x.id, filter(lambda x: x.not_authorized(), res))), "") - write_log(dir, today + "-not-authorized.log", not_authorized) + write_log(dirname, today + "-not-authorized.log", not_authorized) updated = reduce( lambda x, y: x + "\n" + y, sorted( map(lambda x: x.id, filter(lambda x: x.is_ok() and not x.not_modified(), res))), "") - write_log(dir, today + "-updated.log", updated) + write_log(dirname, today + "-updated.log", updated) has_exception = reduce( lambda x, y: x + "\n" + y, sorted(map(lambda x: x.id, filter(lambda x: x.has_exception(), res))), "") - write_log(dir, today + "-raised-exception.log", has_exception) + write_log(dirname, today + "-raised-exception.log", has_exception) raised_ddos = reduce( lambda x, y: x + "\n" + y, sorted( map(lambda x: x.id, filter(lambda x: x.raised_google_ddos(), res))), "") - write_log(dir, today + "-raised-ddos.log", raised_ddos) + write_log(dirname, today + "-raised-ddos.log", raised_ddos) not_in_store = reduce( lambda x, y: x + "\n" + y, sorted(map(lambda x: x.id, filter(lambda x: x.not_in_store(), res))), "") - write_log(dir, today + "-not-in-store.log", not_in_store) + write_log(dirname, today + "-not-in-store.log", not_in_store) new = reduce( lambda x, y: x + "\n" + y, sorted(map(lambda x: x.id, filter(lambda x: x.is_new(), res))), "") - write_log(dir, today + "-new-in-store.log", new) + write_log(dirname, today + "-new-in-store.log", new) file_corruption = reduce( lambda x, y: x + "\n" + y, sorted(map(lambda x: x.id, filter(lambda x: x.corrupt_tar(), res))), "") - write_log(dir, today + "-file-corruption.log", file_corruption) - + write_log(dirname, today + "-file-corruption.log", file_corruption) + sql_exception = reduce( lambda x, y: x + "\n" + y, sorted(map(lambda x: x.id, filter(lambda x: x.sql_exception(), res))), "") - write_log(dir, today + "-sql-exception.log", sql_exception) - + write_log(dirname, today + "-sql-exception.log", sql_exception) + sql_success = reduce( lambda x, y: x + "\n" + y, - sorted(map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))), + sorted( + map(lambda x: x.id, filter(lambda x: not x.sql_success(), res))), "") - write_log(dir, today + "-sql-not-updated.log", sql_success) + write_log(dirname, today + "-sql-not-updated.log", sql_success) def log_summary(verbose, res, stderr=False, runtime=0): @@ -136,7 +131,7 @@ def log_summary(verbose, res, stderr=False, runtime=0): p(" Total runtime: {}\n".format( str(datetime.timedelta(seconds=int(runtime))))) - if not (corrupt_tar_archives == []): + if (corrupt_tar_archives != []): p("\n\n") p("List of extensions with corrupted files/archives:\n") list( @@ -145,7 +140,7 @@ def log_summary(verbose, res, stderr=False, runtime=0): p("\n") -def help(): +def helpmsg(): print("crawler [OPTION]") print(" -h print this help text") print(" -s silent (no log messages)") @@ -160,13 +155,13 @@ def main(argv): verbose = True discover = False try: - opts, args = getopt.getopt(argv, "hsda:p:", ["archive=",'parallel=']) + opts, args = getopt.getopt(argv, "hsda:p:", ["archive=", 'parallel=']) except getopt.GetoptError: - help() + helpmsg() sys.exit(2) for opt, arg in opts: if opt == '-h': - help() + helpmsg() sys.exit() elif opt in ("-a", "--archive"): basedir = arg @@ -189,11 +184,13 @@ def main(argv): log(verbose, "Configuration:\n") log(verbose, " Base dir: {}\n".format(basedir)) - log(verbose, " Archive directory: {}\n".format(archive_dir)) + log(verbose, + " Archive directory: {}\n".format(archive_dir)) log(verbose, " Configuration directory: {}\n".format(conf_dir)) log(verbose, " Discover new extensions: {}\n".format(discover)) log(verbose, " Max num. of concurrent downloads: {}\n".format(parallel)) - log(verbose, " SQLite 3 version: {}\n".format(sqlite3.sqlite_version)) + log(verbose, " SQLite 3 version: {}\n".format( + sqlite3.sqlite_version)) log(verbose, "\n") forum_ext_ids = get_forum_ext_ids(conf_dir, verbose) @@ -204,12 +201,17 @@ def main(argv): discovered_ids = get_new_ids(verbose, known_ids) ext_ids = list(set(discovered_ids) | set(known_ids)) - res = update_extensions(archive_dir, verbose, parallel, forum_ext_ids, ext_ids) + discovered_ids = None + known_ids = None + existing_ids = None - # We re-try (once) the extensions with unknown exceptions, as - # they are often temporary + res = update_extensions(archive_dir, verbose, parallel, forum_ext_ids, + ext_ids) + + # We re-try (once) the extensions with unknown exceptions, as + # they are often temporary has_exception = list(filter(lambda x: x.has_exception(), res)) - if not (has_exception == []): + if (has_exception != []): log(verbose, " {} extensions with unknown exceptions, start another try ...\n". format(str(len(has_exception))))