Fixed style errors and warnings.

Michael Herzberg 2018-04-21 19:00:07 +01:00
parent ac3c1c7f20
commit a789fe505f
19 changed files with 168 additions and 174 deletions

View File

@ -23,7 +23,6 @@ import os
import glob
import re
import json
from multiprocessing import Pool
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired
from functools import partial
@ -44,6 +43,7 @@ from ExtensionCrawler.config import (
from ExtensionCrawler.util import google_dos_protection, value_of, log_info, log_warning, log_exception, setup_logger
from ExtensionCrawler.db import update_db_incremental
class Error(Exception):
pass
@ -82,11 +82,11 @@ class RequestResult:
class UpdateResult:
def __init__(self, id, is_new, exception, res_overview, res_crx,
def __init__(self, ext_id, is_new, exception, res_overview, res_crx,
res_reviews, res_support, res_sql, sql_update, worker_exception=None):
self.id = id
self.ext_id = ext_id
self.new = is_new
self.exception = exception # TODO: should be tar_exception
self.exception = exception # TODO: should be tar_exception
self.res_overview = res_overview
self.res_crx = res_crx
self.res_reviews = res_reviews
@ -188,7 +188,7 @@ def last_modified_http_date(path):
def last_crx(archivedir, extid, date=None):
last_crx = ""
last_crx_path = ""
last_crx_etag = ""
etag_file = os.path.join(archivedir, get_local_archive_dir(extid),
@ -198,14 +198,13 @@ def last_crx(archivedir, extid, date=None):
with open(etag_file, 'r') as f:
d = json.load(f)
return d["last_crx"], d["last_crx_etag"]
except Exception as e:
except Exception:
log_exception("Something was wrong with the etag file {}, deleting it ...".format(etag_file))
try:
os.remove(etag_file)
except Exception as e:
except Exception:
log_exception("Could not remove etag file {}!".format(etag_file))
# If we do not yet have an .etag file present, open the tarfile and look
# there for one. After having done that once, the crawler creates the .etag
# file to avoid opening the tar file in the future.
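The comment above describes a simple read-through cache: the per-extension .etag file is consulted first, and the tar archive is only scanned when that file is missing or unreadable, after which the result is written back for the next run. A minimal sketch of that pattern, with the expensive tar scan reduced to a placeholder callback rather than the crawler's real helper (the real code follows in the next hunk):

import json

def cached_last_crx(etag_file, scan_tar):
    """Return (crx_path, etag); scan_tar() is the expensive fallback."""
    try:
        with open(etag_file) as f:
            d = json.load(f)
            return d["last_crx"], d["last_crx_etag"]
    except (OSError, ValueError, KeyError):
        pass  # cache file missing or corrupt: fall back to the tar archive
    crx_path, etag = scan_tar()
    with open(etag_file, "w") as f:  # write the cache for subsequent runs
        json.dump({"last_crx": crx_path, "last_crx_etag": etag}, f)
    return crx_path, etag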
@ -219,23 +218,23 @@ def last_crx(archivedir, extid, date=None):
date is None or (dateutil.parser.parse(
os.path.split(os.path.split(x.name)[0])[1]) <= date))
])
if old_crxs != []:
last_crx = old_crxs[-1]
if old_crxs:
last_crx_path = old_crxs[-1]
headers_content = t.extractfile(
last_crx + ".headers").read().decode().replace(
last_crx_path + ".headers").read().decode().replace(
'"', '\\"').replace("'", '"')
headers_json = json.loads(headers_content)
last_crx_etag = headers_json["ETag"]
if date is None:
with open(etag_file, 'w') as f:
json.dump({"last_crx": last_crx, "last_crx_etag": last_crx_etag}, f)
json.dump({"last_crx": last_crx_path, "last_crx_etag": last_crx_etag}, f)
return last_crx, last_crx_etag
return last_crx_path, last_crx_etag
def first_crx(archivedir, extid, date=None):
first_crx = ""
first_crx_path = ""
tar = os.path.join(archivedir, get_local_archive_dir(extid),
extid + ".tar")
if os.path.exists(tar):
@ -247,10 +246,10 @@ def first_crx(archivedir, extid, date=None):
os.path.split(os.path.split(x.name)[0])[1])))
])
t.close()
if old_crxs != []:
first_crx = old_crxs[0]
if old_crxs:
first_crx_path = old_crxs[0]
return first_crx
return first_crx_path
def all_crx(archivedir, extid, date=None):
@ -283,7 +282,7 @@ def update_overview(tar, date, ext_id):
def validate_crx_response(res, extid, extfilename):
regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
if not 'Content-Type' in res.headers:
if 'Content-Type' not in res.headers:
raise CrawlError(extid, 'Did not find Content-Type header.', '\n'.join(
res.iter_lines()))
if not res.headers['Content-Type'] == 'application/x-chrome-extension':
@ -351,10 +350,12 @@ def update_crx(archivedir, tmptardir, ext_id, date):
f.write(chunk)
write_text(tmptardir, date, extfilename + ".etag",
res.headers.get("ETag"))
etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id),
ext_id + ".etag")
etag_file = os.path.join(archivedir, get_local_archive_dir(ext_id), ext_id + ".etag")
with open(etag_file, 'w') as f:
json.dump({"last_crx": os.path.join(ext_id, date, extfilename), "last_crx_etag": res.headers.get("ETag")}, f)
json.dump({
"last_crx": os.path.join(ext_id, date, extfilename),
"last_crx_etag": res.headers.get("ETag")
}, f)
except Exception as e:
log_exception("Exception when updating crx", 3, ext_id)
write_text(tmptardir, date, extfilename + ".exception",
@ -367,9 +368,10 @@ def iterate_authors(pages):
for page in pages:
json_page = json.loads(page[page.index("{\""):page.rindex("}}},") + 1])
for annotation in json_page["annotations"]:
if "attributes" in annotation and "replyExists" in annotation["attributes"] and annotation["attributes"]["replyExists"]:
yield (annotation["entity"]["author"],
annotation["entity"]["groups"])
if "attributes" in annotation:
if "replyExists" in annotation["attributes"]:
if annotation["attributes"]["replyExists"]:
yield (annotation["entity"]["author"], annotation["entity"]["groups"])
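For readers following the hunk above: the nested membership checks guard against missing keys before testing replyExists. An equivalent single-expression form, shown purely for illustration (it is not part of this commit), uses dict.get with an empty default:

if annotation.get("attributes", {}).get("replyExists"):
    yield (annotation["entity"]["author"], annotation["entity"]["groups"])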
def update_reviews(tar, date, ext_id):
@ -550,7 +552,7 @@ def update_extension(archivedir, forums, ext_id):
try:
write_text(tardir, date, ext_id + ".sql.exception",
traceback.format_exc())
except Exception as e:
except Exception:
pass
try:
shutil.rmtree(path=tmpdir)
@ -581,13 +583,11 @@ def init_process(verbose, start_pystuck=False):
pystuck.run_server(port=((os.getpid() % 10000) + 10001))
def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
results=[]
with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process, initargs=(verbose, start_pystuck)) as pool:
future = pool.map(partial(update_extension, archivedir, forums),
ext_ids,
chunksize=1,
timeout=timeout)
def execute_parallel(archivedir, timeout, max_workers, ext_ids, forums, verbose, start_pystuck):
results = []
with ProcessPool(max_workers=max_workers, max_tasks=100, initializer=init_process,
initargs=(verbose, start_pystuck)) as pool:
future = pool.map(partial(update_extension, archivedir, forums), ext_ids, chunksize=1, timeout=timeout)
iterator = future.result()
for ext_id in ext_ids:
try:
@ -595,7 +595,7 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forum
except StopIteration:
break
except TimeoutError as error:
log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id,error.args[1]))
log_warning("WorkerException: Processing of %s took longer than %d seconds" % (ext_id, error.args[1]))
results.append(UpdateResult(ext_id, False, None, None, None, None, None, None, None, error))
except ProcessExpired as error:
log_warning("WorkerException: %s (%s), exit code: %d" % (error, ext_id, error.exitcode))
@ -609,8 +609,6 @@ def execute_parallel(archivedir, max_retry, timeout, max_workers, ext_ids, forum
def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, verbose, start_pystuck):
ext_with_forums = []
ext_without_forums = []
forums_ext_ids = (list(set(forums_ext_ids)))
log_info("Updating {} extensions ({} including forums)".format(
@ -621,13 +619,13 @@ def update_extensions(archivedir, parallel, forums_ext_ids, ext_ids, timeout, ve
parallel_ids = ext_ids
log_info("Updating {} extensions excluding forums (parallel)".format(
len(parallel_ids)), 1)
ext_without_forums = execute_parallel(archivedir, 3, timeout, parallel, parallel_ids, False, verbose, start_pystuck)
ext_without_forums = execute_parallel(archivedir, timeout, parallel, parallel_ids, False, verbose, start_pystuck)
# Second, update extensions with forums sequentially (and with delays) to
# avoid running into Google's DDOS detection.
log_info("Updating {} extensions including forums (sequentially)".format(
len(forums_ext_ids)), 1)
ext_with_forums = execute_parallel(archivedir, 3, timeout, 1, forums_ext_ids, True, verbose, start_pystuck)
ext_with_forums = execute_parallel(archivedir, timeout, 1, forums_ext_ids, True, verbose, start_pystuck)
return ext_with_forums + ext_without_forums
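The hunks above rework execute_parallel around pebble's ProcessPool and drop the now-unused max_retry parameter. A self-contained sketch of the same timeout-handling pattern, with a toy worker and made-up values in place of the crawler's update_extension:

from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired

def work(item):
    return item * 2  # stand-in for update_extension(archivedir, forums, ext_id)

if __name__ == "__main__":
    items = [1, 2, 3]
    results = []
    with ProcessPool(max_workers=2, max_tasks=100) as pool:
        future = pool.map(work, items, chunksize=1, timeout=5)
        iterator = future.result()
        for item in items:
            try:
                results.append(next(iterator))
            except StopIteration:
                break
            except TimeoutError as error:
                # error.args[1] carries the timeout that was exceeded
                print("%s took longer than %d seconds" % (item, error.args[1]))
            except ProcessExpired as error:
                print("worker died, exit code: %d" % error.exitcode)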

View File

@ -169,10 +169,10 @@ def update_lib(force, archive, lib):
outphased = []
for lib_ver in local_lib_json['assets']:
version = lib_ver['version']
if not version in cdnjs_versions:
if version not in cdnjs_versions:
logging.warning("Found outphased versions for " + name + " " +
str(version) + " , preserving from archive.")
if not 'outphased' in lib_ver:
if 'outphased' not in lib_ver:
lib_ver[
'outphased'] = datetime.datetime.utcnow().isoformat()
outphased.append(lib_ver)
@ -260,7 +260,7 @@ def delete_orphaned(archive, local_libs, cdnjs_current_libs):
"""Delete all orphaned local libaries."""
dirname = os.path.join(archive, "filedb", "cdnjs", "lib")
for lib in local_libs:
if not lib in cdnjs_current_libs:
if lib not in cdnjs_current_libs:
os.remove(os.path.join(dirname, lib + ".json"))

View File

@ -25,8 +25,7 @@ import logging
import os
import re
import sys
from functools import partial, reduce
from multiprocessing import Pool
from functools import reduce
import dateutil.parser
import git
@ -70,8 +69,8 @@ def pull_list_changed_files(git_path):
for diff in single_fetch_info.commit.diff(
single_fetch_info.old_commit):
logging.debug("Found diff: " + str(diff))
if not diff.a_blob is None:
if not diff.a_blob.path in files:
if diff.a_blob is not None:
if diff.a_blob.path not in files:
files.append(diff.a_blob.path)
return files
@ -98,7 +97,7 @@ def hackish_pull_list_changed_files(git_path):
for line in pull_lines:
match = re.search(r'^ (.+) \| .*$', line)
if not match is None:
if match is not None:
changed_files = match.group(1).split('=>')
for changed_file in changed_files:
files.add(changed_file.strip())
@ -139,6 +138,7 @@ def get_file_libinfo(release_dic, git_path, libfile):
file_info['library'] = lib
file_info['version'] = version
file_info['add_date'] = release_dic[(lib, version)]
# TODO: why is package not used?
package = os.path.join(
reduce(os.path.join, plist[:idx + 1]), "package.json")
return file_info
@ -167,7 +167,7 @@ def get_all_lib_files(cdnjs_git_path, localpath=None):
libvers = set()
files = []
versionidx = len(path_to_list(cdnjs_git_path)) + 4
if not localpath is None:
if localpath is not None:
paths = os.path.join(cdnjs_git_path, localpath)
else:
paths = os.path.join(cdnjs_git_path, 'ajax/libs/**/*')
@ -196,7 +196,7 @@ def update_database_for_file(create_csv, release_dic, cdnjs_git_path, filename,
if os.path.isfile(filename):
logging.info("Updating database for file " + filename)
file_info = get_file_libinfo(release_dic, cdnjs_git_path, filename)
if not file_info is None:
if file_info is not None:
if create_csv:
print(file_info['path'])
print(cdnjs_git_path)
@ -268,7 +268,7 @@ def update_database_for_file_chunked(create_csv, release_dic, cdnjs_git_path,
retries = 0
success = False
max_retries = 4
while (not success and (retries < max_retries)):
while not success and (retries < max_retries):
try:
update_database_for_file_chunked_timeout(create_csv, release_dic,
cdnjs_git_path, filenames)
@ -305,7 +305,7 @@ def get_release_triple(git_path, libver):
lib = plist[-2]
date = get_add_date(git_path, libver)
logging.info("Release information:" + lib + " " + ver + ": " + str(date))
return (lib, ver, date)
return lib, ver, date
def build_release_date_dic(git_path, libvers):
@ -332,7 +332,6 @@ def pull_and_update_db(cdnjs_git_path, create_csv):
def update_db_from_listfile(cdnjs_git_path, listfile, create_csv):
"""Update database (without pull) for files in listfile)"""
paths = []
with open(listfile) as listfileobj:
paths = listfileobj.read().splitlines()
files = []

View File

@ -142,10 +142,12 @@ def const_verbose():
"""Default verbosity."""
return True
def const_use_process_pool():
"""Use ProcessPool (from module 'pebble') for concurrency."""
return False
def const_log_format():
return '%(process)6s %(asctime)s %(levelname)8s %(message)s'
@ -154,14 +156,17 @@ def const_discover():
"""Default configuration of discovery mode"""
return False
def const_download_ext_ids_with_forums():
"""Download extensions with forums (sequential mode)"""
return True
def const_download_ext_ids_without_forums():
"""Download extensions without forums (parallel mode)"""
return True
def const_ext_timeout():
"""Timeout for downloading an individual extension (2 hours)."""
return 2*60*60

View File

@ -15,13 +15,12 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from ExtensionCrawler.config import *
from ExtensionCrawler.util import *
from ExtensionCrawler.crx import *
from ExtensionCrawler.archive import *
from ExtensionCrawler.js_decomposer import decompose_js_with_connection, DetectionType, FileClassification
from ExtensionCrawler.config import const_mysql_config_file
from ExtensionCrawler.crx import read_crx
from ExtensionCrawler.js_decomposer import decompose_js_with_connection
from ExtensionCrawler.util import log_warning, log_debug, log_exception, log_info
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend, convert_date
import re
from bs4 import BeautifulSoup
@ -63,7 +62,7 @@ def get_etag(ext_id, datepath, con):
link = f.read()
linked_date = link[3:].split("/")[0]
result = con.get_etag(ext_id, con.convert_date(linked_date))
result = con.get_etag(ext_id, convert_date(linked_date))
if result is not None:
return result
@ -166,7 +165,7 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
con.insert(
"extension",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
name=name,
version=version,
description=description,
@ -184,12 +183,12 @@ def parse_and_insert_overview(ext_id, date, datepath, con):
con.insert(
"category",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
category_md5=hashlib.md5(category.encode()).digest(),
category=category)
def parse_and_insert_crx(ext_id, date, datepath, con):
def parse_and_insert_crx(ext_id, datepath, con):
crx_path = next(iter(glob.glob(os.path.join(datepath, "*.crx"))), None)
if not crx_path:
return
@ -314,7 +313,7 @@ def parse_and_insert_review(ext_id, date, reviewpath, con):
con.insert(
"review",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(review, "timestamp")).isoformat()
if "timestamp" in review else None,
@ -345,7 +344,7 @@ def parse_and_insert_support(ext_id, date, supportpath, con):
con.insert(
"support",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(review, "timestamp")).isoformat()
if "timestamp" in review else None,
@ -365,7 +364,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
log_debug("- parsing reply file", 3, ext_id)
with open(repliespath) as f:
d = json.load(f)
if not "searchResults" in d:
if "searchResults" not in d:
log_warning("* WARNING: there are no search results in {}".format(
repliespath), 3, ext_id)
return
@ -379,7 +378,7 @@ def parse_and_insert_replies(ext_id, date, repliespath, con):
con.insert(
"reply",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
commentdate=datetime.datetime.utcfromtimestamp(
get(annotation, "timestamp")).isoformat()
if "timestamp" in annotation else None,
@ -413,7 +412,7 @@ def parse_and_insert_status(ext_id, date, datepath, con):
con.insert(
"status",
extid=ext_id,
date=con.convert_date(date),
date=convert_date(date),
crx_status=crx_status,
overview_status=overview_status,
overview_exception=overview_exception)
@ -439,8 +438,8 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
if etag:
try:
parse_and_insert_crx(ext_id, date, datepath, con)
except Exception as e:
parse_and_insert_crx(ext_id, datepath, con)
except Exception:
log_exception("Exception when parsing crx", 3, ext_id)
else:
crx_status = get_crx_status(datepath)
@ -449,40 +448,40 @@ def update_db_incremental_with_connection(tmptardir, ext_id, date, con):
try:
parse_and_insert_overview(ext_id, date, datepath, con)
except Exception as e:
except Exception:
log_exception("Exception when parsing overview", 3, ext_id)
try:
parse_and_insert_status(ext_id, date, datepath, con)
except Exception as e:
except Exception:
log_exception("Exception when parsing status", 3, ext_id)
reviewpaths = glob.glob(os.path.join(datepath, "reviews*-*.text"))
for reviewpath in reviewpaths:
try:
parse_and_insert_review(ext_id, date, reviewpath, con)
except json.decoder.JSONDecodeError as e:
except json.decoder.JSONDecodeError:
log_warning("- WARNING: Review is not a proper json file!", 3,
ext_id)
except Exception as e:
except Exception:
log_exception("Exception when parsing review", 3, ext_id)
supportpaths = glob.glob(os.path.join(datepath, "support*-*.text"))
for supportpath in supportpaths:
try:
parse_and_insert_support(ext_id, date, supportpath, con)
except json.decoder.JSONDecodeError as e:
except json.decoder.JSONDecodeError:
log_warning("- WARNING: Support is not a proper json file!", 3,
ext_id)
except Exception as e:
except Exception:
log_exception("Exception when parsing support", 3, ext_id)
repliespaths = glob.glob(os.path.join(datepath, "*replies.text"))
for repliespath in repliespaths:
try:
parse_and_insert_replies(ext_id, date, repliespath, con)
except json.decoder.JSONDecodeError as e:
except json.decoder.JSONDecodeError:
log_warning("- WARNING: Reply is not a proper json file!", 3,
ext_id)
except Exception as e:
except Exception:
log_exception("Exception when parsing reply", 3, ext_id)

View File

@ -18,18 +18,17 @@
import time
import datetime
from random import uniform
from itertools import starmap
import logging
import MySQLdb
import _mysql_exceptions
import ExtensionCrawler.config as config
from ExtensionCrawler.util import log_info, log_error, log_exception, log_warning
from ExtensionCrawler.util import log_info, log_error, log_warning
class MysqlBackend:
def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(), **kwargs):
def __init__(self, ext_id, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(),
**kwargs):
self.ext_id = ext_id
self.dbargs = kwargs
self.try_wait = try_wait
@ -147,5 +146,6 @@ class MysqlBackend:
result = self.retry(lambda: self.cursor.fetchone())
return result
def convert_date(self, date):
return date[:-6]
def convert_date(date):
return date[:-6]
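convert_date is now a plain module-level function (matching the updated import and call sites in db.py above); it simply slices off the trailing UTC offset so the timestamp fits MySQL's DATETIME format. For example, assuming the crawler's usual datetime.now(timezone.utc).isoformat() timestamps, which end in "+00:00":

convert_date("2018-04-21T19:00:07.123456+00:00")
# -> '2018-04-21T19:00:07.123456'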

View File

@ -17,12 +17,10 @@
"""Python mnodule providing methods for discovering extensions in the
Chrome extension store."""
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import fromstring
import re
from functools import reduce
import requests
from ExtensionCrawler import config
from ExtensionCrawler.util import log_info, log_exception
def crawl_nearly_all_of_ext_ids():
@ -30,7 +28,7 @@ def crawl_nearly_all_of_ext_ids():
def get_inner_elems(doc):
"""Get inner element."""
return ET.fromstring(doc).iterfind(r".//{{{}}}loc".format(
return fromstring(doc).iterfind(r".//{{{}}}loc".format(
config.const_sitemap_scheme()))
def is_generic_url(url):

View File

@ -30,12 +30,14 @@ import magic
from ExtensionCrawler.js_mincer import mince_js
def is_binary_resource(mimetype_magic):
return (mimetype_magic.startswith("image/") or
mimetype_magic.startswith("video/") or
mimetype_magic.startswith("audio/") or
mimetype_magic == "application/pdf")
def normalize_jsdata(str_data):
"""Compute normalized code blocks of a JavaScript file"""
txt = ""
@ -59,9 +61,8 @@ def get_features(s):
def get_simhash(encoding, data):
"""Compute simhash of text."""
str_data = ""
if not encoding is None:
str_data = data.decode(encoding=encoding,errors="replace")
if encoding is not None:
str_data = data.decode(encoding=encoding, errors="replace")
else:
str_data = str(data)
simhash = Simhash(get_features(str_data)).value
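get_simhash above reduces a file's text to a 64-bit fingerprint with the simhash package, so near-duplicate files differ in only a few bits. A small illustration with made-up inputs (get_features is the helper defined just above in this file):

from simhash import Simhash

a = Simhash(get_features("var x = 1; function f() { return x; }")).value
b = Simhash(get_features("var x = 2; function f() { return x; }")).value
hamming = bin(a ^ b).count("1")  # small for near-identical inputs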
@ -82,31 +83,30 @@ def compute_difference(hx, hy):
def get_data_identifiers(data):
"""Get basic data identifiers (size, hashes, normalized hashes, etc.)."""
data_identifier = {}
data_identifier['encoding'] = None
data_identifier['description'] = None
data_identifier['size'] = None
data_identifier['loc'] = None
data_identifier['mimetype_magic'] = None
data_identifier['md5'] = None
data_identifier['sha1'] = None
data_identifier['sha256'] = None
data_identifier['simhash'] = None
data_identifier['size_stripped'] = None
data_identifier['normalized_encoding'] = None
data_identifier['normalized_description'] = None
data_identifier['normalized_size'] = None
data_identifier['normalized_loc'] = None
data_identifier['normalized_mimetype_magic'] = None
data_identifier['normalized_md5'] = None
data_identifier['normalized_sha1'] = None
data_identifier['normalized_sha256'] = None
data_identifier['normalized_simhash'] = None
data_identifier = {
'encoding': None,
'description': None,
'size': None,
'loc': None,
'mimetype_magic': None,
'md5': None,
'sha1': None,
'sha256': None,
'simhash': None,
'size_stripped': None,
'normalized_encoding': None,
'normalized_description': None,
'normalized_size': None,
'normalized_loc': None,
'normalized_mimetype_magic': None,
'normalized_md5': None,
'normalized_sha1': None,
'normalized_sha256': None,
'normalized_simhash': None
}
mimetype_magic = magic.from_buffer(data, mime=True)
magic_desc = ""
try:
magic_desc = magic.from_buffer(data)
except magic.MagicException as exp:
@ -137,9 +137,10 @@ def get_data_identifiers(data):
data_identifier['encoding'] = encoding
try:
normalized_data, normalized_loc = normalize_jsdata(
data.decode(encoding=data_identifier['encoding'],errors="replace"))
data.decode(encoding=data_identifier['encoding'], errors="replace"))
except Exception:
normalized_data = None
normalized_loc = 0
if normalized_data is not None:
normalized_magic_desc = ""
@ -149,7 +150,7 @@ def get_data_identifiers(data):
rgx = re.compile(r' name use count.*$')
msg = str(exp.message)
if re.search(rgx, msg):
magic_desc = re.sub(rgx, '', msg)
normalized_magic_desc = re.sub(rgx, '', msg)
else:
raise exp
normalized_encoding = chardet.detect(normalized_data)['encoding']

View File

@ -18,12 +18,10 @@
general and Chrome extensions in particular."""
import os
import io
from io import StringIO
import re
import json
import zlib
import logging
from enum import Enum
from ExtensionCrawler.js_mincer import mince_js
from ExtensionCrawler.file_identifiers import get_file_identifiers, is_binary_resource
@ -107,15 +105,15 @@ def unknown_lib_identifiers():
re.compile(
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\sv?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
), # MatchType: name version, e.g. mylib v1.2.9b or mylib.anything 1.2.8
re.compile(
r'[\/|\/\/|\s]\*?\s?([a-zA-Z0-9\.]+)\s(?: version)\:?\s?v?([0-9][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
), # MatchType: name version: ver, e.g. mylib version: v1.2.9, or mylib.js version 1.2.8
re.compile(
r'\@*(version)\s?[\:|-]?\s?v?([0-9][\.|\-|\_][0-9.a-z_\\\\-]+)',
re.IGNORECASE
), #MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
), # MatchType: version x.x.x, e.g. @version: 1.2.5 or version - 1.2.5 etc.
re.compile(
r'(version)[\:|\=]\s?.?([0-9]{1,2}[\.|\-|\_][0-9.a-z_\\\\-]+).?',
re.IGNORECASE),
@ -188,13 +186,9 @@ def check_md5_decompressed(con, file_info):
"""Check for known md5 hash (decompressed file content)."""
if con is None:
return file_info
if file_info['dec_md5'] is None:
return file_info
else:
if file_info['dec_md5'] is not None:
libver = con.get_cdnjs_info(file_info['dec_md5'])
if libver is None:
return file_info
else:
if libver is not None:
file_info['lib'] = libver[0]
file_info['version'] = libver[1]
file_info['lib_filename'] = libver[2]
@ -203,7 +197,6 @@ def check_md5_decompressed(con, file_info):
else:
file_info['type'] = FileClassification.LIBRARY
file_info['detectionMethod'] = DetectionType.MD5_DECOMPRESSED
return file_info
return file_info
@ -361,7 +354,7 @@ def analyse_comment_known_libs(zipfile, js_file, js_info, comment):
else:
filename = js_file
for lib, regex in load_lib_identifiers().items():
if ('filecontent' in regex):
if 'filecontent' in regex:
for unkregex in regex['filecontent']:
unkown_lib_matched = unkregex.finditer(comment.content)
for match in unkown_lib_matched:
@ -481,13 +474,14 @@ def decompose_js_with_connection(path_or_zipfileobj, con):
try:
str_data = data.decode(file_info['encoding'])
except Exception:
log_info("Exception during data decoding for entry " +
file_info['filename'], 3)
log_info("Exception during data decoding for entry " + file_info['filename'], 3)
str_data = ''
else:
str_data = ''
info_data_blocks = check_data_blocks(file_info, str_data)
else:
info_data_blocks = None
if info_data_blocks:
inventory = inventory + merge_filename_and_data_info(

View File

@ -198,8 +198,8 @@ def mince_js_fileobj(fileobj):
except StopIteration:
pass
if ((is_comment(state) and is_code_or_string_literal(suc_state)) or
(is_code_or_string_literal(state) and is_comment(suc_state))):
if ((is_comment(state) and is_code_or_string_literal(suc_state)) or (
is_code_or_string_literal(state) and is_comment(suc_state))):
if content.strip():
yield (JsBlock(state, (block_start_line, block_start_cpos),
(line, cpos), content, string_literals))

View File

@ -107,7 +107,7 @@ def main(argv):
logging.info("Starting update of new db libs")
pull_and_update_db(cdnjs_git_path, csv)
logging.info("Finished update of new db libs")
if not listfile is None:
if listfile is not None:
logging.info("Starting update from list file")
update_db_from_listfile(cdnjs_git_path, listfile, csv)
logging.info("Finished update from list file")

14
crawler
View File

@ -19,7 +19,6 @@
A crawler for extensions from the Chrome Web Store.
"""
import os
import sys
import datetime
import time
@ -141,7 +140,7 @@ def log_summary(res, runtime=0):
log_info(" Total runtime: {}".format(
str(datetime.timedelta(seconds=int(runtime)))))
if corrupt_tar_archives != []:
if corrupt_tar_archives:
log_info("")
log_info("List of extensions with corrupted files/archives:")
list(
@ -229,7 +228,8 @@ def parse_args(argv):
max_discover = int(arg)
elif opt == '--pystuck':
start_pystuck = True
return basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck
return [basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums,
download_ext_ids_without_forums, ext_timeout, start_pystuck]
def main(argv):
@ -242,8 +242,8 @@ def main(argv):
multiprocessing.set_start_method("forkserver")
today = datetime.datetime.now(datetime.timezone.utc).isoformat()
basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums, ext_timeout, start_pystuck = parse_args(
argv)
[basedir, parallel, verbose, discover, max_discover, download_ext_ids_with_forums, download_ext_ids_without_forums,
ext_timeout, start_pystuck] = parse_args(argv)
setup_logger(verbose)
@ -304,7 +304,7 @@ def main(argv):
# We re-try (once) the extensions with unknown exceptions, as
# they are often temporary
has_exception = list(filter(lambda x: x.has_exception(), res))
if has_exception != []:
if has_exception:
log_info(
" {} extensions with unknown exceptions, start another try ...".
format(str(len(has_exception))))
@ -318,7 +318,7 @@ def main(argv):
res = list(set(res) - set(has_exception)) + res_update
end_time = time.time()
log_summary(res, end_time - start_time)
log_summary(res, int(end_time - start_time))
log_failures_to_file(log_dir, today, res)

View File

@ -17,7 +17,6 @@
#
import getopt
import os
import sys
import tarfile
import time
@ -30,12 +29,12 @@ import datetime
from ExtensionCrawler.archive import update_db_incremental
from ExtensionCrawler.config import *
from ExtensionCrawler.util import log_info, log_warning, log_error, log_exception
from ExtensionCrawler.util import log_info, log_exception
from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
def help():
def print_help():
print("""create-db [OPTION]""")
print(""" -h print this help text""")
print(""" -a <DIR> archive directory""")
@ -122,11 +121,11 @@ def parse_args(argv):
"maxtaskid=", "from-date=", "until-date=", "help"
])
except getopt.GetoptError:
help()
print_help()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
help()
print_help()
sys.exit()
elif opt in ("-a", "--archive"):
archive = arg
@ -140,12 +139,12 @@ def parse_args(argv):
taskid = int(arg)
elif opt in ("-N", "--maxtaskid"):
maxtaskid = int(arg)
elif opt in ("--from-date"):
elif opt == "--from-date":
from_date = arg
elif opt in ("--until-date"):
elif opt == "--until-date":
until_date = arg
if paths == []:
if not paths:
paths = list(find(archive, "*"))
chunksize = int(len(paths) / maxtaskid)

View File

@ -58,7 +58,6 @@ def main(argv):
basedir = const_basedir()
verbose = True
date = None
extid = ""
useetag = False
output = ""
winfs = False

View File

@ -44,7 +44,6 @@ def main(argv):
"""Main function of the extension crawler."""
verbose = False
silent = False
filename = None
csvfile = None
database = True
try:

View File

@ -39,7 +39,7 @@ import jsbeautifier
from ExtensionCrawler.config import (const_log_format, const_basedir)
from ExtensionCrawler.archive import last_crx, first_crx, all_crx
from ExtensionCrawler.config import (archive_file, get_local_archive_dir)
from ExtensionCrawler.config import get_local_archive_dir
from ExtensionCrawler.js_decomposer import init_file_info
from ExtensionCrawler.js_mincer import mince_js
@ -54,7 +54,7 @@ def is_file_with_c_style_comments(filename):
def jsstrings_data(conf, path, data):
"""Analyze data in memory."""
if not conf.file_pattern is None:
if conf.file_pattern is not None:
if path is None:
return False
elif not fnmatch.fnmatch(path, conf.file_pattern):
@ -98,7 +98,7 @@ def jsstrings_data(conf, path, data):
if analyze_block(conf, path, block, first):
match = True
first = False
if match and conf.output_decoration > 0 and conf.output_decoration < 2:
if match and 0 < conf.output_decoration < 2:
print(path)
return match
@ -112,6 +112,7 @@ def print_block(conf,
if conf.output_decoration > 1:
line_no = block.start[0]
prefix = " " * (block.start[1] - 1)
# TODO: use classifier
classifier = "X"
sep = "=" * (len(path) + 17)
if not first:
@ -129,10 +130,10 @@ def print_block(conf,
path, loc, block.string_literals[0].rstrip())
print(line)
else:
for (pos, str) in block.string_literals:
for (pos, string) in block.string_literals:
loc = '({0[0]:d}/{0[1]:d})'.format(pos)
loc = (' ' * (11 - len(loc))) + loc
line = '{0} {1} [L]: {2}'.format(path, loc, str.rstrip())
line = '{0} {1} [L]: {2}'.format(path, loc, string.rstrip())
print(line)
if code_match:
print("-" * (len(path) + 17))
@ -151,7 +152,7 @@ def analyze_block(conf, path, block, first=False):
"""Print code/comment blocks."""
match = False
regexps = []
if not conf.reg_exp is None:
if conf.reg_exp is not None:
for regexp in conf.reg_exp:
if conf.case_insensitive:
regexps.append(re.compile(r'(' + regexp + ')', re.IGNORECASE))
@ -159,7 +160,7 @@ def analyze_block(conf, path, block, first=False):
regexps.append(re.compile(r'(' + regexp + ')'))
if block.is_comment():
content = block.content
if not conf.reg_exp_comments is None:
if conf.reg_exp_comments is not None:
for regexp in conf.reg_exp_comments:
if conf.case_insensitive:
regexps.append(
@ -179,14 +180,14 @@ def analyze_block(conf, path, block, first=False):
content = block.content
regexps_string = regexps.copy()
regexps_code = regexps.copy()
if not conf.reg_exp_string_literals is None:
if conf.reg_exp_string_literals is not None:
for regexp in conf.reg_exp_string_literals:
if conf.case_insensitive:
regexps.append(
re.compile(r'(' + regexp + ')', re.IGNORECASE))
else:
regexps.append(re.compile(r'(' + regexp + ')'))
if not conf.reg_exp_source is None:
if conf.reg_exp_source is not None:
for regexp in conf.reg_exp_source:
if conf.case_insensitive:
regexps.append(
@ -222,9 +223,9 @@ def analyze_block(conf, path, block, first=False):
match_idxs.add(idx)
string_match = True
block.string_literals = []
for idx, str in enumerate(string_literals):
for idx, string in enumerate(string_literals):
if idx in match_idxs:
block.string_literals.append(str)
block.string_literals.append(string)
code_match = False
for regexp in regexps_code:
@ -259,9 +260,6 @@ def analyze_crx(conf, crx, path=""):
def analyze_tar(conf, tarfilename):
last_crx_file = ''
# from_date
# latest_date
match = False
extid = os.path.splitext(os.path.basename(tarfilename))[0]
from_dateobj = None
latest_dateobj = None
@ -303,7 +301,7 @@ def analyze_tar(conf, tarfilename):
# both dates are given
all_crx_files = all_crx(
os.path.join(conf.archive_dir, "data"), extid)
if all_crx_files == []:
if not all_crx_files:
logging.warning("No crx in " + extid)
else:
with tarfile.open(tarfilename, 'r') as archive:
@ -354,14 +352,13 @@ def analyze_task(conf, task):
"""Analyze one file/tar/crx/extid."""
logging.debug("Analyzing " + task)
extid_re = re.compile('^[a-p]+$')
retval = False
if task.endswith('.crx'):
retval = analyze_crx(conf, task)
elif task.endswith('.tar'):
retval = analyze_tar(conf, task)
elif extid_re.match(task):
tarfile = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
retval = analyze_tar(conf, conf.archive_dir + "/" + tarfile)
tarfilename = "data/" + get_local_archive_dir(task) + "/" + task + '.tar'
retval = analyze_tar(conf, conf.archive_dir + "/" + tarfilename)
else:
retval = analyze_file(conf, task)
return retval

15
extfind
View File

@ -21,10 +21,12 @@ import glob
import os
import sys
import logging
import re
from ExtensionCrawler import config
def help():
def print_help():
print("""extfind [OPTION]""")
print(""" -h print this help text""")
print(""" -a <DIR> archive directory""")
@ -51,7 +53,7 @@ def iter_extension_paths_from_file(archive, n, N, extidlistfile):
if re.fullmatch("[a-p]{32}", line) and os.path.exists(path):
paths += [path]
else:
logging.warn("WARNING: {} is not a valid extension path!".format(path))
logging.warning("WARNING: {} is not a valid extension path!".format(path))
return split(paths, n, N)
@ -67,19 +69,17 @@ def main(argv):
taskid = 1
maxtaskid = 1
paths = []
try:
opts, args = getopt.getopt(argv, "ha:g:e:n:N:", [
"archive=", "glob=", "extidlistfile=", "taskid=",
"maxtaskid=", "help"
])
except getopt.GetoptError:
help()
print_help()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
help()
print_help()
sys.exit()
elif opt in ("-a", "--archive"):
archive = arg
@ -99,11 +99,12 @@ def main(argv):
elif extidglob is not None and extidlistfile is None:
paths = iter_extension_paths(archive, taskid, maxtaskid, extidglob)
else:
help()
print_help()
sys.exit(2)
for path in paths:
print(path)
if __name__ == "__main__":
main(sys.argv[1:])

View File

@ -1,8 +1,10 @@
colorama==0.3.9
pystuck==0.8.5
simhash==1.8.0
tabulate==0.7.7
setuptools==36.2.7
cchardet==2.1.1
mysqlclient==1.3.10
mysqlclient==1.3.12
requests==2.18.1
pycrypto==2.6.1
beautifulsoup4==4.6.0

View File

@ -1,9 +1,12 @@
from setuptools import setup
with open('requirements.txt') as f:
requirements = f.read().splitlines()
setup(
name='Extension Crawler',
description='A collection of utilities for downloading and analyzing browser extensions from the Chrome Web Store.',
author='Achim D. Brucker, Michael Herzberg',
license='GPL 3.0',
install_requires=['GitPython', 'pebble', 'simhash', 'colorama', 'python_magic', 'tabulate', 'requests', 'pycrypto', 'beautifulsoup4', 'python_dateutil', 'mysqlclient', 'cchardet', 'jsbeautifier', 'pystuck']
install_requires=requirements
)